Diffstat (limited to 'llvm/test/CodeGen/X86')
233 files changed, 44190 insertions, 39026 deletions
diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll index 1962dde..f2b4c49 100644 --- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll +++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll @@ -36,10 +36,10 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: Ltmp0: +; CHECK-NEXT: Ltmp0: ## EH_LABEL ; CHECK-NEXT: ## implicit-def: $ebx ; CHECK-NEXT: calll __Znam -; CHECK-NEXT: Ltmp1: +; CHECK-NEXT: Ltmp1: ## EH_LABEL ; CHECK-NEXT: ## %bb.1: ## %bb11 ; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: movb $1, %al @@ -58,13 +58,13 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: jne LBB0_9 ; CHECK-NEXT: ## %bb.10: ## %bb41 ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 -; CHECK-NEXT: Ltmp2: +; CHECK-NEXT: Ltmp2: ## EH_LABEL ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %esi, (%esp) ; CHECK-NEXT: calll _Pjii -; CHECK-NEXT: Ltmp3: +; CHECK-NEXT: Ltmp3: ## EH_LABEL ; CHECK-NEXT: ## %bb.11: ## %bb42 ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: xorl %eax, %eax @@ -126,20 +126,20 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-NEXT: jmp LBB0_8 ; CHECK-NEXT: LBB0_18: ## %bb43 -; CHECK-NEXT: Ltmp5: +; CHECK-NEXT: Ltmp5: ## EH_LABEL ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: calll _OnOverFlow -; CHECK-NEXT: Ltmp6: +; CHECK-NEXT: Ltmp6: ## EH_LABEL ; CHECK-NEXT: jmp LBB0_3 ; CHECK-NEXT: LBB0_2: ## %bb29 -; CHECK-NEXT: Ltmp7: +; CHECK-NEXT: Ltmp7: ## EH_LABEL ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: calll _OnOverFlow -; CHECK-NEXT: Ltmp8: +; CHECK-NEXT: Ltmp8: ## EH_LABEL ; CHECK-NEXT: LBB0_3: ## %bb30 ; CHECK-NEXT: ud2 ; CHECK-NEXT: LBB0_4: ## %bb20.loopexit -; CHECK-NEXT: Ltmp4: +; CHECK-NEXT: Ltmp4: ## EH_LABEL ; CHECK-NEXT: LBB0_9: ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: LBB0_6: ## %bb23 @@ -151,7 +151,7 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl ; CHECK-NEXT: LBB0_5: ## %bb20.loopexit.split-lp -; CHECK-NEXT: Ltmp9: +; CHECK-NEXT: Ltmp9: ## EH_LABEL ; CHECK-NEXT: jmp LBB0_6 ; CHECK-NEXT: Lfunc_end0: bb: diff --git a/llvm/test/CodeGen/X86/3addr-16bit.ll b/llvm/test/CodeGen/X86/3addr-16bit.ll index c9390d9..2b692bf 100644 --- a/llvm/test/CodeGen/X86/3addr-16bit.ll +++ b/llvm/test/CodeGen/X86/3addr-16bit.ll @@ -10,27 +10,27 @@ define zeroext i16 @test1(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test1: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: incl %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: incl %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB0_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB0_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test1: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 
%eax, %ecx ; X86-NEXT: incl %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB0_2 @@ -63,27 +63,27 @@ define zeroext i16 @test2(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test2: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: decl %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: decl %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB1_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB1_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test2: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: decl %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB1_2 @@ -118,27 +118,27 @@ define zeroext i16 @test3(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test3: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: addl $2, %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: addl $2, %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB2_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB2_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test3: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl $2, %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB2_2 @@ -171,19 +171,19 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test4: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: addl %edi, %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB3_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB3_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test4: @@ -191,8 +191,8 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: cmpw %cx, %dx ; X86-NEXT: jne LBB3_2 diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll index 06cf968..8a8e7a3 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll @@ -297,30 +297,30 @@ define dso_local void @test6(i16 signext %0) nounwind { ; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movl $buf, %ecx -; 
CHECK-NEXT: movl $32, %edx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: movl $buf, %edx +; CHECK-NEXT: movl $32, %esi ; CHECK-NEXT: jmp .LBB5_1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB5_3: # %if.false ; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1 -; CHECK-NEXT: decl %esi +; CHECK-NEXT: decl %eax ; CHECK-NEXT: .LBB5_4: # %loop.bb2 ; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1 -; CHECK-NEXT: leal (%rdi,%rsi), %r8d +; CHECK-NEXT: leal (%rdi,%rax), %r8d ; CHECK-NEXT: movw %r8w, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: cmpw $7, %si +; CHECK-NEXT: cmpw $7, %ax ; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: tilestored %tmm0, (%rcx,%rdx) +; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) ; CHECK-NEXT: jne .LBB5_5 ; CHECK-NEXT: .LBB5_1: # %loop.bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: jne .LBB5_3 ; CHECK-NEXT: # %bb.2: # %if.true ; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1 -; CHECK-NEXT: incl %esi +; CHECK-NEXT: incl %eax ; CHECK-NEXT: jmp .LBB5_4 ; CHECK-NEXT: .LBB5_5: # %exit ; CHECK-NEXT: tilerelease diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll index 87059c5..6ae7b22 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 { ; CHECK-LABEL: @test_no_bitcast( diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll index 5fb2dcd..ca7c357 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) { ; CHECK-LABEL: @test_amx_load_non_O0( diff --git a/llvm/test/CodeGen/X86/GlobalISel/fp-bitcast.ll b/llvm/test/CodeGen/X86/GlobalISel/fp-bitcast.ll new file mode 100644 index 0000000..dad33ca --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/fp-bitcast.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -global-isel | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -global-isel -mattr=+avx | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -global-isel -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 + +define half @test_i16_to_half(i16 %0) { +; SSE2-LABEL: test_i16_to_half: +; SSE2: # %bb.0: # %entry +; 
SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_i16_to_half: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: test_i16_to_half: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovd %edi, %xmm0 +; AVX512-NEXT: retq +entry: + %2 = bitcast i16 %0 to half + ret half %2 +} + +define i16 @test_half_to_i16(half %0) { +; SSE2-LABEL: test_half_to_i16: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: # kill: def $eax killed $eax def $ax +; SSE2-NEXT: retq +; +; AVX-LABEL: test_half_to_i16: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: # kill: def $eax killed $eax def $ax +; AVX-NEXT: retq +; +; AVX512-LABEL: test_half_to_i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $eax killed $eax def $ax +; AVX512-NEXT: retq +entry: + %2 = bitcast half %0 to i16 + ret i16 %2 +} diff --git a/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll new file mode 100644 index 0000000..841c9a6 --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=CHECK + +define void @test_reloc_none() { +; CHECK-LABEL: test_reloc_none: +; CHECK: # %bb.0: +; CHECK-NEXT: .Lreloc_none0: +; CHECK-NEXT: .reloc .Lreloc_none0, BFD_RELOC_NONE, foo +; CHECK-NEXT: retq + call void @llvm.reloc.none(metadata !"foo") + ret void +} + +declare void @llvm.reloc.none(metadata) diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir index 41e1b5b..5c059a4 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir @@ -1,5 +1,6 @@ -# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 -# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X86 +# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X64 --- | @@ -30,24 +31,23 @@ ... --- name: test_copy -# ALL-LABEL: name: test_copy alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -56,24 +56,23 @@ body: | ... 
--- name: test_copy2 -# ALL-LABEL: name: test_copy2 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy2 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -82,30 +81,35 @@ body: | ... --- name: test_copy3 -# ALL-LABEL: name: test_copy3 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr16[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr16 = COPY $ax -# X32-NEXT: %3:gr16_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; X86-LABEL: name: test_copy3 + ; X86: liveins: $eax + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr16_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy3 + ; X64: liveins: $eax + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s16) = COPY $ax %1(s8) = G_TRUNC %0(s16) %2(s32) = G_ZEXT %1(s8) @@ -115,27 +119,25 @@ body: | ... 
--- name: test_copy4 -# ALL-LABEL: name: test_copy4 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $eax -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %2:gr32 = MOVZX32rr16 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy4 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $eax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[MOVZX32rr16_:%[0-9]+]]:gr32 = MOVZX32rr16 [[COPY1]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr16_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $eax %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ZEXT %1(s16) @@ -145,30 +147,35 @@ body: | ... --- name: test_copy5 -# ALL-LABEL: name: test_copy5 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# X32-NEXT: %3:gr32_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; X86-LABEL: name: test_copy5 + ; X86: liveins: $eax, $edx + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr32_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy5 + ; X64: liveins: $eax, $edx + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s8) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s8) @@ -178,29 +185,26 @@ body: | ... 
--- name: test_copy6 -# ALL-LABEL: name: test_copy6 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 3, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %3:low32_addr_access_rbp = IMPLICIT_DEF -# ALL-NEXT: %2:low32_addr_access_rbp = INSERT_SUBREG %3, %1, %subreg.sub_16bit -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; CHECK-LABEL: name: test_copy6 + ; CHECK: liveins: $eax, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[DEF:%[0-9]+]]:low32_addr_access_rbp = IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:low32_addr_access_rbp = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.sub_16bit + ; CHECK-NEXT: $eax = COPY [[INSERT_SUBREG]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s16) diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index 0fbfb42..9223348 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -7,9 +7,11 @@ ; CHECK-LABEL: Pass Arguments: ; CHECK-NEXT: Target Library Information +; CHECK-NEXT: Runtime Library Function Analysis ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information +; CHECK-NEXT: Library Function Lowering Analysis ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Profile summary info @@ -68,8 +70,6 @@ ; CHECK-NEXT: X86 Indirect Branch Tracking ; CHECK-NEXT: X86 vzeroupper inserter ; CHECK-NEXT: Compressing EVEX instrs when possible -; CHECK-NEXT: X86 Discriminate Memory Operands -; CHECK-NEXT: X86 Insert Cache Prefetches ; CHECK-NEXT: X86 insert wait instruction ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: Remove Loads Into Fake Uses diff --git a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir index 348a290..2445306 100644 --- a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir +++ b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir @@ -55,7 +55,7 @@ !9 = !DILocalVariable(name: "4", scope: !5, file: !1, line: 4, type: !10) !10 = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned) !11 = !DILocation(line: 4, column: 1, scope: !5) - !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) + !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !7) ... 
--- diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll index 97894db..ee44820 100644 --- a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -1513,3 +1513,32 @@ define i1 @pr84831(i64 %arg) { %trunc = trunc i63 %or to i1 ret i1 %trunc } + +define void @pr169691(ptr %p0, i64 %implicit, i1 zeroext %carry) { +; CHECK-LABEL: pr169691: +; CHECK: # %bb.0: +; CHECK-NEXT: addb $-1, %dl +; CHECK-NEXT: adcq %rsi, (%rdi) +; CHECK-NEXT: adcq %rsi, 8(%rdi) +; CHECK-NEXT: retq + %a0 = load i64, ptr %p0, align 8 + %uaddo0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a0, i64 %implicit) + %uaddo0.1 = extractvalue { i64, i1 } %uaddo0, 1 + %uaddo0.0 = extractvalue { i64, i1 } %uaddo0, 0 + %zextc = zext i1 %carry to i64 + %uaddo0b = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %uaddo0.0, i64 %zextc) + %uaddo0b.1 = extractvalue { i64, i1 } %uaddo0b, 1 + %uaddo0b.0 = extractvalue { i64, i1 } %uaddo0b, 0 + %carry0 = or i1 %uaddo0.1, %uaddo0b.1 + store i64 %uaddo0b.0, ptr %p0, align 8 + + %p1 = getelementptr inbounds nuw i8, ptr %p0, i64 8 + %a1 = load i64, ptr %p1, align 8 + %uaddo1 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a1, i64 %implicit) + %uaddo1.0 = extractvalue { i64, i1 } %uaddo1, 0 + %zext0 = zext i1 %carry0 to i64 + %uaddo1b = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %uaddo1.0, i64 %zext0) + %uaddo1b.0 = extractvalue { i64, i1 } %uaddo1b, 0 + store i64 %uaddo1b.0, ptr %p1, align 8 + ret void +} diff --git a/llvm/test/CodeGen/X86/amx-tf32-internal.ll b/llvm/test/CodeGen/X86/amx-tf32-internal.ll index 6d0f3c5..caf7a1c 100644 --- a/llvm/test/CodeGen/X86/amx-tf32-internal.ll +++ b/llvm/test/CodeGen/X86/amx-tf32-internal.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+avx512f, \ -; RUN: -mattr=+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s +; RUN: -mattr=+amx-tf32 -verify-machineinstrs | FileCheck %s define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-LABEL: test_amx: @@ -20,7 +20,6 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { ; CHECK-NEXT: tilezero %tmm1 ; CHECK-NEXT: tilezero %tmm2 ; CHECK-NEXT: tmmultf32ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttmmultf32ps %tmm1, %tmm0, %tmm2 ; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx) ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper @@ -31,9 +30,8 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) %c1 = call x86_amx @llvm.x86.tmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) - %c2 = call x86_amx @llvm.x86.ttmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c2) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c1) ret void } @@ -43,4 +41,3 @@ declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) declare x86_amx @llvm.x86.tmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) diff --git a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll index af1a7ae..642c1b7 100644 --- a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32 -verify-machineinstrs | FileCheck %s define void @test_tmmultf32ps() { ; CHECK-LABEL: test_tmmultf32ps: @@ -11,13 +11,3 @@ define void @test_tmmultf32ps() { } declare void @llvm.x86.tmmultf32ps(i8 %A, i8 %B, i8 %C) -define void @test_ttmmultf32ps() { -; CHECK-LABEL: test_ttmmultf32ps: -; CHECK: # %bb.0: -; CHECK-NEXT: ttmmultf32ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: retq - call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3) - ret void -} -declare void @llvm.x86.ttmmultf32ps(i8 %A, i8 %B, i8 %C) - diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll deleted file mode 100755 index 1f5758c..0000000 --- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll +++ /dev/null @@ -1,122 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0 -; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2 -; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR - -define void @test_amx(i64 %stride, i8* %addr1) #0 { -; CHECK-LABEL: test_amx: -; CHECK: # %bb.0: -; CHECK-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 -; CHECK-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 -; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 -; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx: -; EGPR: # %bb.0: -; EGPR-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e] -; EGPR-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e] -; EGPR-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x04,0x3e] -; EGPR-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e] -; EGPR-NEXT: retq # encoding: [0xc3] - call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride) - ret void -} -declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 ) -declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 ) - -define void @test_amx2(i8* %base, i64 %stride) #0 { -; O0-LABEL: test_amx2: -; O0: # %bb.0: -; O0-NEXT: xorps %xmm0, %xmm0 -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O0-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: 
t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 -; O0-NEXT: movw $8, %ax -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: # implicit-def: $al -; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O0-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 -; O0-NEXT: tilerelease -; O0-NEXT: retq -; -; O2-LABEL: test_amx2: -; O2: # %bb.0: -; O2-NEXT: xorps %xmm0, %xmm0 -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; O2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; O2-NEXT: movw $8, %ax -; O2-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 -; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 -; O2-NEXT: tilerelease -; O2-NEXT: retq -; -; EGPR-LABEL: test_amx2: -; EGPR: # %bb.0: -; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0] -; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37] -; EGPR-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: retq # encoding: [0xc3] - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - ret void -} -declare { x86_amx, 
x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64) diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll deleted file mode 100644 index 4f41410..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll +++ /dev/null @@ -1,136 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s - -@buf = dso_local global [2048 x i8] zeroinitializer, align 16 -@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - -define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { -; CHECK-LABEL: test_tile_2rpntlvwz0: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000 -; CHECK-NEXT: .cfi_offset %rbx, -24 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # kill: def $dx killed $dx killed $edx -; CHECK-NEXT: movw %si, %cx -; CHECK-NEXT: movw %di, %ax -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $cl -; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: # implicit-def: $al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl $buf, %esi -; CHECK-NEXT: movl $32, %edi -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4 -; CHECK-NEXT: movabsq $64, %rbx -; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload -; CHECK-NEXT: movabsq $64, %rbx -; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: 
tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) -; CHECK-NEXT: movl $64, %edi -; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 -; CHECK-NEXT: movl $buf2, %edx -; CHECK-NEXT: movl $32, %esi -; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) -; CHECK-NEXT: leaq -8(%rbp), %rsp -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3 - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3 - %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3 - %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3 - %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3 - %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3 - %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3 - %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3 - %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3 - %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3 - ret void -} - -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 - -declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - -declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 - -attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" } -attributes #1 = { argmemonly nofree nounwind readonly } -attributes #2 = { nofree nosync nounwind readnone } -attributes #3 = { nounwind } -attributes #4 = { argmemonly nounwind writeonly } - -!llvm.module.flags = !{!0, !1, !2} - -!0 = !{i32 1, !"wchar_size", i32 4} -!1 = !{i32 7, !"uwtable", i32 2} -!2 = !{i32 7, !"frame-pointer", i32 2} diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir deleted file mode 100644 index ab12ab3..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir +++ /dev/null @@ -1,165 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: 
-mattr=+amx-transpose -run-pass=fasttileconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: [] -liveins: - - { reg: '$edi', virtual-reg: '' } - - { reg: '$esi', virtual-reg: '' } - - { reg: '$edx', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 3, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 5, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 6, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 7, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $rdi, $rsi, $rdx, $rax - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $rdi, $rsi, $rdx, $rax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: renamable $rcx = MOV32ri64 64 - ; CHECK-NEXT: MOV64mr %stack.7, 1, 
$noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) - ; CHECK-NEXT: renamable $cx = MOV16ri 64 - ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) - ; CHECK-NEXT: renamable $cx = MOV16ri 16 - ; CHECK-NEXT: renamable $r8w = MOV16ri 16 - ; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 50, $noreg, $al :: (store (s512) into %stack.4 + 50, align 2, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 20, $noreg, $cx :: (store (s512) into %stack.4 + 20, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 49, $noreg, $al :: (store (s512) into %stack.4 + 49, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 18, $noreg, $di :: (store (s512) into %stack.4 + 18, align 2, basealign 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 52, $noreg, $al :: (store (s512) into %stack.4 + 52, align 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 24, $noreg, $cx :: (store (s512) into %stack.4 + 24, align 4) - ; CHECK-NEXT: $al = IMPLICIT_DEF - ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 53, $noreg, $al :: (store (s512) into %stack.4 + 53, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 26, $noreg, $di :: (store (s512) into %stack.4 + 26, align 2, basealign 4) - ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) - ; CHECK-NEXT: renamable $r9 = COPY $rsi - ; CHECK-NEXT: $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) - ; CHECK-NEXT: renamable $r8 = COPY $rdi - ; CHECK-NEXT: $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6) - ; CHECK-NEXT: renamable $r10 = COPY $rax - ; CHECK-NEXT: $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) - ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg - ; CHECK-NEXT: renamable $tmm0 = COPY renamable $tmm5 - ; CHECK-NEXT: renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx - ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable 
$rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg - ; CHECK-NEXT: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 - ; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $zmm0 = AVX512_512_SET0 - VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4) - MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - renamable $rcx = MOV32ri64 64 - MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7) - renamable $cx = MOV16ri 64 - MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5) - renamable $cx = MOV16ri 16 - renamable $r8w = MOV16ri 16 - MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6) - PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4) - renamable $r9 = COPY $rsi - $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7) - renamable $r8 = COPY $rdi - $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6) - renamable $r10 = COPY $rax - $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5) - renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg - renamable $tmm0 = COPY renamable $tmm5 - renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1 - PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx - PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0 - renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg - renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg - renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg - renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2 - PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0 -... 
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir deleted file mode 100644 index c7d241f..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir +++ /dev/null @@ -1,153 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=greedy,tileconfig -o - %s | FileCheck %s - ---- | - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { - entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf, i64 0, i64 0), i64 32) #5 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = extractvalue { x86_amx, x86_amx } %0, 1 - %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 - %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %3, x86_amx %1, x86_amx %2) #5 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf2, i64 0, i64 0), i64 32, x86_amx %4) #5 - ret void - } - - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) #1 - - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 - - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - - declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #4 - - attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { nounwind } - -... 
---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr32, preferred-register: '' } - - { id: 1, class: gr32, preferred-register: '' } - - { id: 2, class: gr32, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr16, preferred-register: '' } - - { id: 5, class: gr16, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 12, class: tile, preferred-register: '' } - - { id: 13, class: gr64, preferred-register: '' } - - { id: 14, class: vr512, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 4 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx - - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 26, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 26, align 2, basealign 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 53, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 53, align 1, basealign 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 24, $noreg, [[COPY1]].sub_16bit :: (store (s512) into %stack.0 + 24, align 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 52, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 52, align 4) - ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 16, align 4) - ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4) - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def 
dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 @buf - ; CHECK-NEXT: [[MOV32ri64_1:%[0-9]+]]:gr64_nosp = MOV32ri64 32 - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[COPY]].sub_16bit, [[MOV32ri64_]], 1, [[MOV32ri64_1]], 0, $noreg - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY2]].sub_16bit, [[COPY]].sub_16bit, [[COPY1]].sub_16bit, [[PTILEZEROV]], [[PT2RPNTLVWZ0V]].sub_t0, [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[MOV32ri64_2:%[0-9]+]]:gr64 = MOV32ri64 @buf2 - ; CHECK-NEXT: PTILESTOREDV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[MOV32ri64_2]], 1, [[MOV32ri64_1]], 0, $noreg, [[PTILEZEROV]] - ; CHECK-NEXT: RET 0 - %2:gr32 = COPY $edx - %1:gr32 = COPY $esi - %0:gr32 = COPY $edi - %14:vr512 = AVX512_512_SET0 - VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, %14 :: (store (s512) into %stack.0, align 4) - MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - %6:gr64 = MOV32ri64 @buf - %7:gr64_nosp = MOV32ri64 32 - %8:tilepair = PT2RPNTLVWZ0V %0.sub_16bit, %1.sub_16bit, %2.sub_16bit, %6, 1, %7, 0, $noreg - %12:tile = PTILEZEROV %0.sub_16bit, %1.sub_16bit - %12:tile = PTDPBSSDV %0.sub_16bit, %2.sub_16bit, %1.sub_16bit, %12, %8.sub_t0, %8.sub_t1 - %13:gr64 = MOV32ri64 @buf2 - PTILESTOREDV %0.sub_16bit, %1.sub_16bit, %13, 1, %7, 0, $noreg, %12 - RET 0 - -... 
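The tile-pair .mir tests removed in this group all reduce to the same IR-level contract, visible in the declarations above: llvm.x86.t2rpntlvwz0.internal loads two tiles in one shot and returns them as an aggregate, whose halves are split out with extractvalue before feeding a tile computation. A minimal sketch, assuming only the intrinsic signatures declared in the deleted file (the function name is illustrative, not a test in the tree):

; Sketch only: load a tile pair, multiply the halves into a zeroed
; accumulator, and store the result -- the body of the deleted
; test_tile_2rpntlvwz0 condensed to its essentials.
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)

define void @pair_sketch(i16 %row, i16 %col0, i16 %col1, i8* %src, i8* %dst) {
  %p = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, i8* %src, i64 32)
  %t0 = extractvalue { x86_amx, x86_amx } %p, 0
  %t1 = extractvalue { x86_amx, x86_amx } %p, 1
  %acc = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0)
  %r = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %acc, x86_amx %t0, x86_amx %t1)
  call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, i8* %dst, i64 32, x86_amx %r)
  ret void
}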
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir deleted file mode 100644 index 66b15aa..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir +++ /dev/null @@ -1,97 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=lowertilecopy -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: [] -liveins: - - { reg: '$edi', virtual-reg: '' } - - { reg: '$esi', virtual-reg: '' } - - { reg: '$edx', virtual-reg: '' } - - { reg: '$cx', virtual-reg: '' } - - { reg: '$r9', virtual-reg: '' } - - { reg: '$r10', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 43, name: '', type: default, offset: 0, size: 64, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 68, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 - - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg - ; CHECK-NEXT: $rax = MOV64ri 64 - ; CHECK-NEXT: TILESTORED %stack.3, 1, $rax, 0, $noreg, $tmm5 :: (store (s8192) into %stack.3) - ; CHECK-NEXT: $tmm0 = TILELOADD %stack.3, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.3) - ; CHECK-NEXT: $rax = MOV64ri 64 - ; CHECK-NEXT: TILESTORED %stack.2, 1, $rax, 0, $noreg, $tmm4 :: (store (s8192) into %stack.2) - ; CHECK-NEXT: $tmm1 = TILELOADD %stack.2, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.2) - ; CHECK-NEXT: renamable $r8 = MOV32ri64 64 - ; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.1) - ; CHECK-NEXT: renamable $di = MOV16ri 64 - ; CHECK-NEXT: 
renamable $cx = MOV16ri 16 - ; CHECK-NEXT: PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 - ; CHECK-NEXT: PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 - PLDTILECFGV %stack.43, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.43, align 4) - renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg - renamable $tmm0 = COPY renamable $tmm5 - renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5 - renamable $r8 = MOV32ri64 64 - MOV64mr %stack.68, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.68) - renamable $di = MOV16ri 64 - renamable $cx = MOV16ri 16 - PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1 - PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0 - -... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll deleted file mode 100644 index 3549875..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll +++ /dev/null @@ -1,87 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py - ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s - ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s - - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - - ; Function Attrs: noinline nounwind optnone uwtable - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1, ptr %m) #0 { -; CHECK-LABEL: @test_tile_2rpntlvwz0( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = udiv i16 [[COL1:%.*]], 4 -; CHECK-NEXT: [[TMP1:%.*]] = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M:%.*]], i64 [[TMP3]], x86_amx [[TMP2]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[COL1]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP6]], x86_amx [[TMP5]]) -; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP9:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP9]], x86_amx [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP11]]) -; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[COL1]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP14]]) -; 
CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP0]], i16 [[COL0]], ptr [[M]], i64 [[TMP17]]) -; CHECK-NEXT: [[TMP20:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL0]], i16 [[COL1]], x86_amx [[TMP13]], x86_amx [[TMP16]], x86_amx [[TMP19]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[COL0]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP21]], x86_amx [[TMP20]]) -; CHECK-NEXT: ret void -; - entry: - - %0 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr getelementptr inbounds ([2048 x i8], ptr @buf, i64 0, i64 0), i64 32) #7 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #7 - store <256 x i32> %2, ptr %m, align 1024 - - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #7 - store <256 x i32> %4, ptr %m, align 1024 - - %5 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #7 - %6 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #7 - store <256 x i32> %6, ptr %m, align 64 - - %7 = load <256 x i32>, ptr %m, align 64 - %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %7) #7 - %9 = load <256 x i32>, ptr %m, align 64 - %10 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %9) #7 - %11 = load <256 x i32>, ptr %m, align 64 - %12 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #7 - - %13 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col0, i16 %col1, x86_amx %8, x86_amx %10, x86_amx %12) #7 - %14 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13) #7 - store <256 x i32> %14, ptr %m, align 64 - - ret void - } - - ; Function Attrs: argmemonly nounwind readonly - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #2 - - ; Function Attrs: nounwind readnone - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #3 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #4 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #4 - - ; Function Attrs: nounwind readnone - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #3 - - ; Function Attrs: argmemonly nounwind writeonly - declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #5 - - attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nofree nounwind willreturn writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { argmemonly nounwind writeonly 
"target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #6 = { argmemonly nofree nounwind willreturn "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #7 = { nounwind } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll deleted file mode 100644 index 96966264..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll +++ /dev/null @@ -1,61 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s -; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s - - @buf = dso_local global [2048 x i8] zeroinitializer, align 16 - @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 - - ; Function Attrs: nounwind uwtable - define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { -; CHECK-LABEL: @test_tile_2rpntlvwz0( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1:%.*]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL1]], i16 [[COL0]], x86_amx [[TMP3]], x86_amx [[TMP1]], x86_amx [[TMP2]]) #[[ATTR3]] -; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr @buf2, i64 32, x86_amx [[TMP4]]) #[[ATTR3]] -; CHECK-NEXT: ret void -; - entry: - %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #5 - %1 = extractvalue { x86_amx, x86_amx } %0, 0 - %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #5 - %3 = extractvalue { x86_amx, x86_amx } %0, 1 - %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #5 - %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5 - %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #5 - %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #5 - %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #5 - %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #5 - %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #5 - %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #5 - %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #5 - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #5 - ret void - } - - ; Function Attrs: argmemonly nounwind readonly - declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 - - ; Function Attrs: nounwind readnone - declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 - - ; Function Attrs: nounwind - declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, 
i16, x86_amx, x86_amx, x86_amx) #3 - - ; Function Attrs: nounwind readnone - declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 - - ; Function Attrs: argmemonly nounwind writeonly - declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 - - attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" } - attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" } - attributes #5 = { nounwind } diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir deleted file mode 100644 index 1e3b242..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir +++ /dev/null @@ -1,134 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=fastpretileconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr64_nosp, preferred-register: '' } - - { id: 1, class: gr16, preferred-register: '' } - - { id: 2, class: gr16, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr64, preferred-register: '' } - - { id: 5, class: gr64, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 181, class: tile, preferred-register: '' } - - { id: 183, class: tile, preferred-register: '' } - - { id: 185, class: tile, preferred-register: '' } - - { id: 186, class: tile, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1024 - adjustsStack: false - hasCalls: true - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: - - { id: 18, name: '', 
type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 19, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 20, name: '', type: default, offset: 0, size: 8, alignment: 8, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 21, name: '', type: default, offset: 0, size: 8, - alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $rdi, $rsi, $rdx, $rax - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $rdi, $rsi, $rdx, $rax - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4) - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 64 - ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 64 - ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 16 - ; CHECK-NEXT: [[MOV16ri2:%[0-9]+]]:gr16 = MOV16ri 16 - ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.4, align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rdx - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY $rax - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[COPY2]], 1, killed [[COPY3]], 0, $noreg - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY5]] - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY4]] - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri]], [[MOV16ri1]] - ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTILEZEROV]] - ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[PTILELOADDV]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]] - ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri]], killed [[MOV16ri1]], killed [[COPY2]], 1, killed [[MOV32ri64_]], 0, $noreg, killed 
[[PTDPBSSDV]] - %0:gr64_nosp = MOV32ri64 64 - %1:gr16 = MOV16ri 64 - %2:gr16 = MOV16ri 16 - %3:gr16 = MOV16ri 16 - %4:gr64 = COPY $rsi - %5:gr64 = COPY $rdi - %6:gr64 = COPY $rdx - %7:gr64_nosp = COPY $rax - %8:tilepair = PT2RPNTLVWZ0V %1, %2, %3, %6, 1, killed %7, 0, $noreg - %9:tile = COPY %8.sub_t1 - %10:tile = COPY %8.sub_t0 - PTILESTOREDV %1, %2, %4, 1, %0, 0, $noreg, killed %10 - PTILESTOREDV %1, %3, %5, 1, %0, 0, $noreg, killed %9 - %11:tile = PTILEZEROV %1, %2 - PTILESTOREDV %1, %2, %6, 1, %0, 0, $noreg, killed %11 - %181:tile = PTILELOADDV %1, %2, %4, 1, %0, 0, $noreg - %183:tile = PTILELOADDV %1, %3, %5, 1, %0, 0, $noreg - %185:tile = PTILELOADDV %1, %2, %6, 1, %0, 0, $noreg - %186:tile = PTDPBSSDV %1, %2, %3, %181, killed %183, killed %185 - PTILESTOREDV killed %1, killed %2, killed %6, 1, killed %0, 0, $noreg, killed %186 -... diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir deleted file mode 100644 index ac2cdb4..0000000 --- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir +++ /dev/null @@ -1,113 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ -# RUN: -mattr=+amx-transpose -run-pass=tilepreconfig -o - %s | FileCheck %s - ---- -name: test_tile_2rpntlvwz0 -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false -hasEHScopes: false -hasEHFunclets: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: gr32, preferred-register: '' } - - { id: 1, class: gr32, preferred-register: '' } - - { id: 2, class: gr32, preferred-register: '' } - - { id: 3, class: gr16, preferred-register: '' } - - { id: 4, class: gr16, preferred-register: '' } - - { id: 5, class: gr16, preferred-register: '' } - - { id: 6, class: gr64, preferred-register: '' } - - { id: 7, class: gr64_nosp, preferred-register: '' } - - { id: 8, class: tilepair, preferred-register: '' } - - { id: 9, class: tile, preferred-register: '' } - - { id: 10, class: tile, preferred-register: '' } - - { id: 11, class: tile, preferred-register: '' } - - { id: 12, class: tile, preferred-register: '' } - - { id: 13, class: gr64, preferred-register: '' } -liveins: - - { reg: '$edi', virtual-reg: '%0' } - - { reg: '$esi', virtual-reg: '%1' } - - { reg: '$edx', virtual-reg: '%2' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: [] - restorePoint: [] -fixedStack: [] -stack: [] -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: - amxProgModel: ManagedRA -body: | - bb.0.entry: - liveins: $edi, $esi, $edx, $rax, $rbx - - ; CHECK-LABEL: name: test_tile_2rpntlvwz0 - ; CHECK: liveins: $edi, $esi, $edx, $rax, $rbx - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0 - ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store 
(s512) into %stack.0, align 4) - ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr16 = COPY [[COPY1]].sub_16bit - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr16 = COPY [[COPY2]].sub_16bit - ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.0, align 4) - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax - ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32 - ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY5]], [[COPY4]], [[COPY3]], killed [[COPY6]], 1, [[MOV32ri64_]], 0, $noreg - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0 - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY5]], [[COPY4]] - ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY5]], [[COPY3]], [[COPY4]], [[PTILEZEROV]], killed [[COPY8]], killed [[COPY7]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY $rbx - ; CHECK-NEXT: PTILESTOREDV [[COPY5]], [[COPY4]], killed [[COPY9]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]] - ; CHECK-NEXT: RET 0 - %2:gr32 = COPY $edx - %1:gr32 = COPY $esi - %0:gr32 = COPY $edi - %3:gr16 = COPY %2.sub_16bit - %4:gr16 = COPY %1.sub_16bit - %5:gr16 = COPY %0.sub_16bit - %6:gr64 = COPY $rax - %7:gr64_nosp = MOV32ri64 32 - %8:tilepair = PT2RPNTLVWZ0V %5, %4, %3, killed %6, 1, %7, 0, $noreg - %9:tile = COPY %8.sub_t1 - %10:tile = COPY %8.sub_t0 - %11:tile = PTILEZEROV %5, %4 - %12:tile = PTDPBSSDV %5, %3, %4, %11, killed %10, killed %9 - %13:gr64 = COPY $rbx - PTILESTOREDV %5, %4, killed %13, 1, %7, 0, $noreg, killed %12 - RET 0 - -... 
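At the MIR level, every one of the deleted configure/preconfigure tests funnels the loaded pair through the tilepair register class and its two subregisters. A condensed sketch of that shared pattern, using the opcode and subregister names exactly as they appear in the deleted bodies (the named virtual registers are illustrative, and the operand definitions are elided):

# Sketch only: split a PT2RPNTLVWZ0V result into its two tiles.
%pair:tilepair = PT2RPNTLVWZ0V %row, %c0, %c1, %base, 1, %stride, 0, $noreg
%lo:tile = COPY %pair.sub_t0
%hi:tile = COPY %pair.sub_t1

The config passes then only have to materialize one 64-byte tile configuration covering both halves: the MOV8mi store of palette byte 1 followed by PLDTILECFGV in the CHECK lines above.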
diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll deleted file mode 100644 index 4cfd97a..0000000 --- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll +++ /dev/null @@ -1,371 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR - -define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 { -; CHECK-LABEL: test_amx: -; CHECK: # %bb.0: -; CHECK-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 -; CHECK-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 -; CHECK-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 -; CHECK-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 -; CHECK-NEXT: ttransposed %tmm3, %tmm1 -; CHECK-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 -; CHECK-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 -; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1 -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx: -; EGPR: # %bb.0: -; EGPR-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31] -; EGPR-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31] -; EGPR-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31] -; EGPR-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31] -; EGPR-NEXT: ttransposed %tmm3, %tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb] -; EGPR-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca] -; EGPR-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5] -; EGPR-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca] -; EGPR-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca] -; EGPR-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca] -; EGPR-NEXT: tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca] -; EGPR-NEXT: retq # encoding: [0xc3] - call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride) - call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride) - call void @llvm.x86.ttransposed(i8 1, i8 3) - call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6) - call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3) - call void @llvm.x86.tconjtfp16(i8 1, i8 2) - ret void -} - -declare void @llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride) -declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1) -declare void @llvm.x86.ttdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2) -declare void @llvm.x86.ttdpfp16ps(i8 %tile0, i8 %tile1, i8 
%tile2) -declare void @llvm.x86.ttcmmimfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.ttcmmrlfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.tconjtcmmimfp16ps(i8 %A, i8 %B, i8 %C) -declare void @llvm.x86.tconjtfp16(i8 %A, i8 %B) - -define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx2: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: subq $2928, %rsp # imm = 0xB70 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, %ax -; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 -; CHECK-NEXT: tilezero %tmm1 -; CHECK-NEXT: tilezero %tmm2 -; CHECK-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: movabsq $64, %rbp -; CHECK-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill -; CHECK-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload -; CHECK-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 -; CHECK-NEXT: tconjtfp16 %tmm3, %tmm0 -; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx) -; CHECK-NEXT: addq $2928, %rsp # imm = 0xB70 -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx2: -; EGPR: # %bb.0: -; EGPR-NEXT: pushq %rbp # encoding: [0x55] -; EGPR-NEXT: subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00] -; EGPR-NEXT: # imm = 0xB70 -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d] -; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08] -; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00] -; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] -; EGPR-NEXT: tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8] -; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0] -; EGPR-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0] -; EGPR-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 # encoding: 
[0xc4,0xe2,0x73,0x6c,0xd0] -; EGPR-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0] -; EGPR-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0] -; EGPR-NEXT: movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; EGPR-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill -; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00] -; EGPR-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload -; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00] -; EGPR-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8] -; EGPR-NEXT: tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3] -; EGPR-NEXT: tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17] -; EGPR-NEXT: addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00] -; EGPR-NEXT: # imm = 0xB70 -; EGPR-NEXT: popq %rbp # encoding: [0x5d] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - - %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) - %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) - %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) - %c1 = call x86_amx @llvm.x86.ttdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) - %c2 = call x86_amx @llvm.x86.ttdpfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b) - %c3 = call x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c2, x86_amx %a, x86_amx %b) - %c4 = call x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c3, x86_amx %a, x86_amx %b) - %c5 = call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c4, x86_amx %a, x86_amx %b) - %c6 = call x86_amx @llvm.x86.tconjtfp16.internal(i16 8, i16 8, x86_amx %c5) - - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c4) - ret void -} - -define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx3: -; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movw $8, %cx -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: ttransposed %tmm4, %tmm0 -; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx) -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx3: -; EGPR: # %bb.0: -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08] -; EGPR-NEXT: movw $8, 
-{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] -; EGPR-NEXT: movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00] -; EGPR-NEXT: movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] -; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16] -; EGPR-NEXT: ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4] -; EGPR-NEXT: tilestored %tmm0, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17] -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) - %5 = extractvalue { x86_amx, x86_amx } %4, 0 - %6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6) - ret void -} - -define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 { -; CHECK-LABEL: test_amx_spill: -; CHECK: # %bb.0: -; CHECK-NEXT: subq $6088, %rsp # imm = 0x17C8 -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, %ax -; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 -; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: movabsq $64, %rcx -; CHECK-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill -; 
CHECK-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill -; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; CHECK-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm6, (%rsi,%rdx) -; CHECK-NEXT: tilestored %tmm7, (%rsi,%rdx) -; CHECK-NEXT: addq $6088, %rsp # imm = 0x17C8 -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -; -; EGPR-LABEL: test_amx_spill: -; EGPR: # %bb.0: -; EGPR-NEXT: subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00] -; EGPR-NEXT: # imm = 0x17C8 -; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] -; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe] -; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb4,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00] -; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08] -; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00] -; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80] -; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] -; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] -; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16] -; EGPR-NEXT: movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -; EGPR-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00] -; EGPR-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x34,0x16] -; EGPR-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: 
[0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00] -; EGPR-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16] -; EGPR-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0] -; EGPR-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00] -; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00] -; EGPR-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00] -; EGPR-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0] -; EGPR-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload -; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00] -; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] -; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] -; EGPR-NEXT: tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16] -; EGPR-NEXT: tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16] -; EGPR-NEXT: addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00] -; EGPR-NEXT: # imm = 0x17C8 -; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] -; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; EGPR-NEXT: retq # encoding: [0xc3] - %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) - %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %b5 = call { x86_amx, 
x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) - %e11 = extractvalue { x86_amx, x86_amx } %b1, 0 - %e12 = extractvalue { x86_amx, x86_amx } %b1, 1 - %e21 = extractvalue { x86_amx, x86_amx } %b2, 0 - %e22 = extractvalue { x86_amx, x86_amx } %b2, 1 - %e31 = extractvalue { x86_amx, x86_amx } %b3, 0 - %e32 = extractvalue { x86_amx, x86_amx } %b3, 1 - %e41 = extractvalue { x86_amx, x86_amx } %b4, 0 - %e42 = extractvalue { x86_amx, x86_amx } %b4, 1 - %e51 = extractvalue { x86_amx, x86_amx } %b5, 0 - %e52 = extractvalue { x86_amx, x86_amx } %b5, 1 - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e11) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e21) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52) - ret void -} - -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) -declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64) -declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64) -declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx) -declare x86_amx @llvm.x86.ttdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttdpfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tconjtfp16.internal(i16, i16, x86_amx) - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/X86/apx/compress-evex.mir b/llvm/test/CodeGen/X86/apx/compress-evex.mir index c0ecfac0..e0873d3 100644 --- a/llvm/test/CodeGen/X86/apx/compress-evex.mir +++ b/llvm/test/CodeGen/X86/apx/compress-evex.mir @@ -139,3 +139,11 @@ body: | $ax = XOR16rr_ND $ax, killed $di, implicit-def dead $eflags RET64 $rax ... +--- +name: setzuccm_2_setccm +body: | + bb.0.entry: + liveins: $eflags + ; CHECK: sete 7(%rsp) # EVEX TO LEGACY Compression encoding: [0x0f,0x94,0x44,0x24,0x07] + SETZUCCm $rsp, 1, $noreg, 7, $noreg, 4, implicit killed $eflags +... 
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll
index 805fc7c..6f31aef 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll
@@ -1,76 +1,80 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr,+avx --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,AVX
 define i32 @map0(ptr nocapture noundef readonly %a, i64 noundef %b) {
- ; SSE-LABEL: name: map0
- ; SSE: bb.0.entry:
- ; SSE-NEXT: liveins: $rdi, $rsi
- ; SSE-NEXT: {{ $}}
- ; SSE-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rsi
- ; SSE-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
- ; SSE-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s32) from %ir.add.ptr)
- ; SSE-NEXT: $eax = COPY [[MOV32rm]]
- ; SSE-NEXT: RET 0, $eax
- ; AVX-LABEL: name: map0
- ; AVX: bb.0.entry:
- ; AVX-NEXT: liveins: $rdi, $rsi
- ; AVX-NEXT: {{ $}}
- ; AVX-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rsi
- ; AVX-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
- ; AVX-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s32) from %ir.add.ptr)
- ; AVX-NEXT: $eax = COPY [[MOV32rm]]
- ; AVX-NEXT: RET 0, $eax
+; CHECK-LABEL: map0:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rsi, %r16 # encoding: [0xd5,0x18,0x89,0xf0]
+; CHECK-NEXT: movq %rdi, %r17 # encoding: [0xd5,0x18,0x89,0xf9]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl (%r17,%r16,4), %eax # encoding: [0xd5,0x30,0x8b,0x04,0x81]
+; CHECK-NEXT: retq # encoding: [0xc3]
 entry:
 %add.ptr = getelementptr inbounds i32, ptr %a, i64 %b
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 %0 = load i32, ptr %add.ptr
 ret i32 %0
 }
-define i32 @map1_or_vex(<2 x double> noundef %a) {
- ; SSE-LABEL: name: map1_or_vex
- ; SSE: bb.0.entry:
- ; SSE-NEXT: liveins: $xmm0
- ; SSE-NEXT: {{ $}}
- ; SSE-NEXT: [[COPY:%[0-9]+]]:vr128 = COPY $xmm0
- ; SSE-NEXT: [[CVTSD2SIrr_Int:%[0-9]+]]:gr32 = nofpexcept CVTSD2SIrr_Int [[COPY]], implicit $mxcsr
- ; SSE-NEXT: $eax = COPY [[CVTSD2SIrr_Int]]
- ; SSE-NEXT: RET 0, $eax
- ; AVX-LABEL: name: map1_or_vex
- ; AVX: bb.0.entry:
- ; AVX-NEXT: liveins: $xmm0
- ; AVX-NEXT: {{ $}}
- ; AVX-NEXT: [[COPY:%[0-9]+]]:vr128 = COPY $xmm0
- ; AVX-NEXT: [[VCVTSD2SIrr_Int:%[0-9]+]]:gr32_norex2 = nofpexcept VCVTSD2SIrr_Int [[COPY]], implicit $mxcsr
- ; AVX-NEXT: $eax = COPY [[VCVTSD2SIrr_Int]]
- ; AVX-NEXT: RET 0, $eax
+define i32 @map1_or_vex(<2 x double> noundef %a) nounwind {
+; SSE-LABEL: map1_or_vex:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: cvtsd2si %xmm0, %r16d # encoding: [0xf2,0xd5,0xc0,0x2d,0xc0]
+; SSE-NEXT: #APP
+; SSE-NEXT: nop # encoding: [0x90]
+; SSE-NEXT: #NO_APP
+; SSE-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0]
+; SSE-NEXT: retq # encoding: [0xc3]
+;
+; AVX-LABEL: map1_or_vex:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rbx # encoding: [0x53]
+; AVX-NEXT: vcvtsd2si %xmm0, %ebx # encoding: [0xc5,0xfb,0x2d,0xd8]
+; AVX-NEXT: #APP
+; AVX-NEXT: nop # encoding: [0x90]
+; AVX-NEXT: #NO_APP
+; AVX-NEXT: movl %ebx, %eax # encoding: [0x89,0xd8]
+; AVX-NEXT: popq %rbx # encoding: [0x5b]
+; AVX-NEXT: retq # encoding: [0xc3]
 entry:
 %0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a)
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 ret i32 %0
 }
-define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) {
- ; SSE-LABEL: name: map2_or_vex
- ; SSE: bb.0.entry:
- ; SSE-NEXT: liveins: $rdi, $rsi
- ; SSE-NEXT: {{ $}}
- ; SSE-NEXT: [[COPY:%[0-9]+]]:gr64_norex2_nosp = COPY $rsi
- ; SSE-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; SSE-NEXT: [[PABSBrm:%[0-9]+]]:vr128 = PABSBrm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s128) from %ir.add.ptr)
- ; SSE-NEXT: $xmm0 = COPY [[PABSBrm]]
- ; SSE-NEXT: RET 0, $xmm0
- ; AVX-LABEL: name: map2_or_vex
- ; AVX: bb.0.entry:
- ; AVX-NEXT: liveins: $rdi, $rsi
- ; AVX-NEXT: {{ $}}
- ; AVX-NEXT: [[COPY:%[0-9]+]]:gr64_norex2_nosp = COPY $rsi
- ; AVX-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; AVX-NEXT: [[VPABSBrm:%[0-9]+]]:vr128 = VPABSBrm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s128) from %ir.add.ptr)
- ; AVX-NEXT: $xmm0 = COPY [[VPABSBrm]]
- ; AVX-NEXT: RET 0, $xmm0
+define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) nounwind {
+; SSE-LABEL: map2_or_vex:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %r14 # encoding: [0x41,0x56]
+; SSE-NEXT: pushq %rbx # encoding: [0x53]
+; SSE-NEXT: movq %rsi, %rbx # encoding: [0x48,0x89,0xf3]
+; SSE-NEXT: movq %rdi, %r14 # encoding: [0x49,0x89,0xfe]
+; SSE-NEXT: #APP
+; SSE-NEXT: nop # encoding: [0x90]
+; SSE-NEXT: #NO_APP
+; SSE-NEXT: pabsb (%r14,%rbx,4), %xmm0 # encoding: [0x66,0x41,0x0f,0x38,0x1c,0x04,0x9e]
+; SSE-NEXT: popq %rbx # encoding: [0x5b]
+; SSE-NEXT: popq %r14 # encoding: [0x41,0x5e]
+; SSE-NEXT: retq # encoding: [0xc3]
+;
+; AVX-LABEL: map2_or_vex:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %r14 # encoding: [0x41,0x56]
+; AVX-NEXT: pushq %rbx # encoding: [0x53]
+; AVX-NEXT: movq %rsi, %rbx # encoding: [0x48,0x89,0xf3]
+; AVX-NEXT: movq %rdi, %r14 # encoding: [0x49,0x89,0xfe]
+; AVX-NEXT: #APP
+; AVX-NEXT: nop # encoding: [0x90]
+; AVX-NEXT: #NO_APP
+; AVX-NEXT: vpabsb (%r14,%rbx,4), %xmm0 # encoding: [0xc4,0xc2,0x79,0x1c,0x04,0x9e]
+; AVX-NEXT: popq %rbx # encoding: [0x5b]
+; AVX-NEXT: popq %r14 # encoding: [0x41,0x5e]
+; AVX-NEXT: retq # encoding: [0xc3]
 entry:
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 %add.ptr = getelementptr inbounds i32, ptr %b, i64 %c
 %a = load <2 x i64>, ptr %add.ptr
 %0 = bitcast <2 x i64> %a to <16 x i8>
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll
index 5fa4cb4..a6ab98f 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll
@@ -1,17 +1,20 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+amx-tile,+egpr | FileCheck %s
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+amx-tile,+egpr | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+amx-tile,+egpr --show-mc-encoding | FileCheck %s
-define dso_local void @amx(ptr noundef %data) {
- ; CHECK-LABEL: name: amx
- ; CHECK: bb.0.entry:
- ; CHECK-NEXT: liveins: $rdi
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_norex2_nosp = MOV32ri64 8
- ; CHECK-NEXT: PTILELOADD 4, [[COPY]], 1, killed [[MOV32ri64_]], 0, $noreg
- ; CHECK-NEXT: RET 0
- entry:
+define dso_local void @amx(ptr noundef %data) nounwind {
+; CHECK-LABEL: amx:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbx # encoding: [0x53]
+; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl $8, %eax # encoding: [0xb8,0x08,0x00,0x00,0x00]
+; CHECK-NEXT: tileloadd (%rbx,%rax), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x24,0x03]
+; CHECK-NEXT: popq %rbx # encoding: [0x5b]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 call void @llvm.x86.tileloadd64(i8 4, ptr %data, i64 8)
 ret void
 }
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll
index a9ca591..e7bc0c3 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll
@@ -1,17 +1,22 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=-sse,+egpr | FileCheck %s
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=-sse,+egpr | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-sse,+egpr --show-mc-encoding | FileCheck %s
-define void @x87(ptr %0, ptr %1) {
- ; CHECK-LABEL: name: x87
- ; CHECK: bb.0 (%ir-block.2):
- ; CHECK-NEXT: liveins: $rdi, $rsi
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_norex2 = COPY $rsi
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: [[LD_Fp32m:%[0-9]+]]:rfp32 = nofpexcept LD_Fp32m [[COPY1]], 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (load (s32) from %ir.0)
- ; CHECK-NEXT: nofpexcept ST_Fp32m [[COPY]], 1, $noreg, 0, $noreg, killed [[LD_Fp32m]], implicit-def dead $fpsw, implicit $fpcw :: (store (s32) into %ir.1)
- ; CHECK-NEXT: RET 0
+define void @x87(ptr %0, ptr %1) nounwind {
+; CHECK-LABEL: x87:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %r14 # encoding: [0x41,0x56]
+; CHECK-NEXT: pushq %rbx # encoding: [0x53]
+; CHECK-NEXT: movq %rsi, %rbx # encoding: [0x48,0x89,0xf3]
+; CHECK-NEXT: movq %rdi, %r14 # encoding: [0x49,0x89,0xfe]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: flds (%r14) # encoding: [0x41,0xd9,0x06]
+; CHECK-NEXT: fstps (%rbx) # encoding: [0xd9,0x1b]
+; CHECK-NEXT: popq %rbx # encoding: [0x5b]
+; CHECK-NEXT: popq %r14 # encoding: [0x41,0x5e]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 %3 = load float, ptr %0
 store float %3, ptr %1
 ret void
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll
index 8653442..9b89bce 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll
@@ -1,70 +1,81 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+xsave,+egpr | FileCheck %s
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+xsave,+egpr | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+xsave,+egpr --show-mc-encoding | FileCheck %s
-define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) {
- ; CHECK-LABEL: name: test_xsave
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $rdi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: $edx = COPY [[COPY1]]
- ; CHECK-NEXT: $eax = COPY [[COPY]]
- ; CHECK-NEXT: XSAVE [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
- ; CHECK-NEXT: RET 0
+define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xsave:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx # encoding: [0x53]
+; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0]
+; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2]
+; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0]
+; CHECK-NEXT: xsave (%rbx) # encoding: [0x0f,0xae,0x23]
+; CHECK-NEXT: popq %rbx # encoding: [0x5b]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 call void @llvm.x86.xsave(ptr %ptr, i32 %hi, i32 %lo)
 ret void;
 }
 declare void @llvm.x86.xsave(ptr, i32, i32)
-define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) {
- ; CHECK-LABEL: name: test_xsave64
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $rdi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: $edx = COPY [[COPY1]]
- ; CHECK-NEXT: $eax = COPY [[COPY]]
- ; CHECK-NEXT: XSAVE64 [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
- ; CHECK-NEXT: RET 0
+define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xsave64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx # encoding: [0x53]
+; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0]
+; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2]
+; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0]
+; CHECK-NEXT: xsave64 (%rbx) # encoding: [0x48,0x0f,0xae,0x23]
+; CHECK-NEXT: popq %rbx # encoding: [0x5b]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 call void @llvm.x86.xsave64(ptr %ptr, i32 %hi, i32 %lo)
 ret void;
 }
 declare void @llvm.x86.xsave64(ptr, i32, i32)
-define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) {
- ; CHECK-LABEL: name: test_xrstor
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $rdi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: $edx = COPY [[COPY1]]
- ; CHECK-NEXT: $eax = COPY [[COPY]]
- ; CHECK-NEXT: XRSTOR [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
- ; CHECK-NEXT: RET 0
+define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xrstor:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx # encoding: [0x53]
+; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0]
+; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2]
+; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0]
+; CHECK-NEXT: xrstor (%rbx) # encoding: [0x0f,0xae,0x2b]
+; CHECK-NEXT: popq %rbx # encoding: [0x5b]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 call void @llvm.x86.xrstor(ptr %ptr, i32 %hi, i32 %lo)
 ret void;
 }
 declare void @llvm.x86.xrstor(ptr, i32, i32)
-define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) {
- ; CHECK-LABEL: name: test_xrstor64
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $rdi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: $edx = COPY [[COPY1]]
- ; CHECK-NEXT: $eax = COPY [[COPY]]
- ; CHECK-NEXT: XRSTOR64 [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
- ; CHECK-NEXT: RET 0
+define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xrstor64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx # encoding: [0x53]
+; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0]
+; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2]
+; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0]
+; CHECK-NEXT: xrstor64 (%rbx) # encoding: [0x48,0x0f,0xae,0x2b]
+; CHECK-NEXT: popq %rbx # encoding: [0x5b]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 call void @llvm.x86.xrstor64(ptr %ptr, i32 %hi, i32 %lo)
 ret void;
 }
diff --git a/llvm/test/CodeGen/X86/apx/setzucc.ll b/llvm/test/CodeGen/X86/apx/setzucc.ll
index 6eb2d69..d32ccf8 100644
--- a/llvm/test/CodeGen/X86/apx/setzucc.ll
+++ b/llvm/test/CodeGen/X86/apx/setzucc.ll
@@ -89,3 +89,15 @@ bb1:
 bb2:
 ret i32 0
 }
+
+define i32 @highmask_i32_mask32(i32 %val) {
+; CHECK-LABEL: highmask_i32_mask32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl $-1048576, %edi # imm = 0xFFF00000
+; CHECK-NEXT: setzune %al
+; CHECK-NEXT: retq
+ %and = and i32 %val, -1048576
+ %cmp = icmp ne i32 %and, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
index b4d40fe..71887e3 100644
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
@@ -2156,15 +2156,17 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no
 ; X64-LABEL: atomic_shl1_mask01_xor_16_gpr_brz:
 ; X64: # %bb.0: # %entry
 ; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %ecx, %edx
 ; X64-NEXT: andb $15, %cl
-; X64-NEXT: movl $1, %edx
-; X64-NEXT: shll %cl, %edx
+; X64-NEXT: movl $1, %esi
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: shll %cl, %esi
 ; X64-NEXT: movzwl (%rdi), %eax
 ; X64-NEXT: .p2align 4
 ; X64-NEXT: .LBB34_1: # %atomicrmw.start
 ; X64-NEXT: # =>This Inner Loop Header: Depth=1
 ; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: xorl %edx, %ecx
+; X64-NEXT: xorl %esi, %ecx
 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
 ; X64-NEXT: lock cmpxchgw %cx, (%rdi)
 ; X64-NEXT: # kill: def $ax killed $ax def $eax
@@ -2172,12 +2174,12 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no
 ; X64-NEXT: # %bb.2: # %atomicrmw.end
 ; X64-NEXT: movzwl %ax, %ecx
 ; X64-NEXT: movw $123, %ax
-; X64-NEXT: testl %ecx, %edx
+; X64-NEXT: testl %ecx, %esi
 ; X64-NEXT: je .LBB34_3
 ; X64-NEXT: # %bb.4: # %return
 ; X64-NEXT: retq
 ; X64-NEXT: .LBB34_3: # %if.then
-; X64-NEXT: movzwl %si, %eax
+; X64-NEXT: movzwl %dx, %eax
 ; X64-NEXT: movzwl (%rdi,%rax,2), %eax
 ; X64-NEXT: retq
 entry:
@@ -3398,10 +3400,12 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n
 ; X64-LABEL: atomic_shl1_mask01_and_16_gpr_brnz:
 ; X64: # %bb.0: # %entry
 ; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %ecx, %edx
 ; X64-NEXT: andb $15, %cl
-; X64-NEXT: movl $1, %edx
-; X64-NEXT: shll %cl, %edx
+; X64-NEXT: movl $1, %esi
+; X64-NEXT: shll %cl, %esi
 ; X64-NEXT: movl $-2, %r8d
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT: roll %cl, %r8d
 ; X64-NEXT: movzwl (%rdi), %eax
 ; X64-NEXT: .p2align 4
@@ -3415,10 +3419,10 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n
 ; X64-NEXT: jne .LBB52_1
 ; X64-NEXT: # %bb.2: # %atomicrmw.end
 ; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: testl %eax, %edx
+; X64-NEXT: testl %eax, %esi
 ; X64-NEXT: je .LBB52_3
 ; X64-NEXT: # %bb.4: # %if.then
-; X64-NEXT: movzwl %si, %eax
+; X64-NEXT: movzwl %dx, %eax
 ; X64-NEXT: movzwl (%rdi,%rax,2), %eax
 ; X64-NEXT: retq
 ; X64-NEXT: .LBB52_3:
diff --git a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
index 105ee7f..e118f5d 100644
--- a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
+++ b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
@@ -46,8 +46,9 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x
 ; CHECK-NEXT: orl %edx, %eax
 ; CHECK-NEXT: lock cmpxchgl %ecx, (%rbx)
 ; CHECK-NEXT: setne %cl
-; CHECK-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-NEXT: movl %eax, %edx
 ; CHECK-NEXT: shrl $16, %eax
+; CHECK-NEXT: pinsrw $0, %edx, %xmm0
 ; CHECK-NEXT: pinsrw $0, %eax, %xmm1
 ; CHECK-NEXT: testb %cl, %cl
 ; CHECK-NEXT: jne .LBB0_1
diff --git a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll
index 76d84c1..860d60f 100644
--- a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll
@@ -97,3 +97,99 @@ define <16 x i32>@test_int_x86_avx10_vpdpbuuds_512(<16 x i32> %x0, <16 x i32> %x
 %res = call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
 ret <16 x i32> %res
 }
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwsud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwsud_512:
+; X86: # %bb.0:
+; X86-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwsud_512:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwsuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwsuds_512:
+; X86: # %bb.0:
+; X86-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwsuds_512:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwusd_512:
+; X86: # %bb.0:
+; X86-NEXT: vpdpwusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwusd_512:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwusds_512:
+; X86: # %bb.0:
+; X86-NEXT: vpdpwusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwusds_512:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwuud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwuud_512:
+; X86: # %bb.0:
+; X86-NEXT: vpdpwuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwuud_512:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwuuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwuuds_512:
+; X86: # %bb.0:
+; X86-NEXT: vpdpwuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwuuds_512:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
index 79849a7..d9b4635 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
@@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512:
 ; X86: # %bb.0:
-; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2]
 ; X86-NEXT: vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08]
 ; X86-NEXT: vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
index a2aad60..e9c6cb6 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
@@ -220,7 +220,7 @@ declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8
 ; VNNI INT16
-define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) {
 ; X86-LABEL: test_mm512_dpwsud_epi32:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
@@ -231,12 +231,12 @@ define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 ; X64: # %bb.0:
 ; X64-NEXT: vpdpwsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0x07]
 ; X64-NEXT: retq # encoding: [0xc3]
- %__B = load <16 x i32>, ptr %pB
- %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %__B = load <32 x i16>, ptr %pB
+ %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
 ret <16 x i32> %res
 }
-define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) {
 ; X86-LABEL: test_mm512_mask_dpwsuds_epi32:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
@@ -248,13 +248,13 @@ define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %_
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0xd3,0xc2]
 ; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
 %bst = bitcast i16 %__U to <16 x i1>
 %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
 ret <16 x i32> %res
 }
-define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) {
 ; X86-LABEL: test_mm512_maskz_dpwsud_epi32:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
@@ -266,14 +266,14 @@ define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %_
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0xd2,0xc2]
 ; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
 %bst = bitcast i16 %__U to <16 x i1>
 %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
 ret <16 x i32> %res
 }
-declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
-declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <32 x i16>, <32 x i16>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <32 x i16>, <32 x i16>)
 define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
 ; X86-LABEL: test_mm512_dpwusd_epi32:
diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
index 0f2c75b..01b7618 100644
--- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
@@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2]
 ; X86-NEXT: vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08]
 ; X86-NEXT: vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1]
@@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128:
 ; X86: # %bb.0:
-; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2]
 ; X86-NEXT: vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08]
 ; X86-NEXT: vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
index 1f270d5..bf7f937 100644
--- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
@@ -334,7 +334,7 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>)
 ; VNNI INT16
-define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__B) {
+define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) {
 ; X86-LABEL: test_mm_mask_dpwsud_epi32:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -346,13 +346,13 @@ define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0xd2,0xc2]
 ; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
 %bst = bitcast i4 %__U to <4 x i1>
 %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
 ret <4 x i32> %res
 }
-define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) {
 ; X86-LABEL: test_mm_maskz_dpwsuds_epi32:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -364,13 +364,13 @@ define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0xd3,0xc2]
 ; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
 %bst = bitcast i4 %__U to <4 x i1>
 %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
 ret <4 x i32> %res
 }
-define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) {
 ; X86-LABEL: test_mm256_maskz_dpwsuds_epi32:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -382,13 +382,13 @@ define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0xd3,0xc2]
 ; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
 %bst = bitcast i8 %__U to <8 x i1>
 %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
 ret <8 x i32> %res
 }
-define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) {
 ; X86-LABEL: test_mm256_mask_dpwsud_epi32:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -400,16 +400,16 @@ define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W,
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0xd2,0xc2]
 ; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
 %bst = bitcast i8 %__U to <8 x i1>
 %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
 ret <8 x i32> %res
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <16 x i16>, <16 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <16 x i16>, <16 x i16>)
 define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
 ; X86-LABEL: test_mm_mask_dpwusd_epi32:
diff --git a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
index 38d54cf..00db1fb 100644
--- a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
@@ -652,14 +652,14 @@ define <4 x i32> @test_int_x86_mask_vcvtt_pd2udqs_128(<2 x double> %x0, <4 x i32
 ; X64-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vcvttpd2dqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc8]
+; X64-NEXT: vcvttpd2udqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc8]
 ; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X64-NEXT: retq # encoding: [0xc3]
 ;
 ; X86-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vcvttpd2dqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc8]
+; X86-NEXT: vcvttpd2udqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc8]
 ; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT: retl # encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> %src, i8 %mask)
@@ -670,13 +670,13 @@ define <4 x i32> @test_int_x86_maskz_vcvtt_pd2udqs_128_z(<2 x double> %x0, i8 %m
 ; X64-LABEL: test_int_x86_maskz_vcvtt_pd2udqs_128_z:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6d,0xc0]
+; X64-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6c,0xc0]
 ; X64-NEXT: retq # encoding: [0xc3]
 ;
 ; X86-LABEL: test_int_x86_maskz_vcvtt_pd2udqs_128_z:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6d,0xc0]
+; X86-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6c,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> zeroinitializer, i8 %mask)
 ret <4 x i32> %res
@@ -686,13 +686,13 @@ define <4 x i32> @test_int_x86_mask_vcvtt_pd2udqs_128_undef(<2 x double> %x0, i8
 ; X64-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128_undef:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc0]
+; X64-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc0]
 ; X64-NEXT: retq # encoding: [0xc3]
 ;
 ; X86-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128_undef:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc0]
+; X86-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> undef, i8 %mask)
 ret <4 x i32> %res
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 1133cdfd..d21df472 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -121,14 +121,13 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
 define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
 ; CHECK-LABEL: mul_v32i8:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm3
-; CHECK-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm3
-; CHECK-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm2
+; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-NEXT: vpand %ymm3, %ymm2, %ymm2
+; CHECK-NEXT: vpandn %ymm1, %ymm3, %ymm1
 ; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: vpsllw $8, %ymm0, %ymm0
-; CHECK-NEXT: vpor %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0
 ; CHECK-NEXT: ret{{[l|q]}}
 %x = mul <32 x i8> %i, %j
 ret <32 x i8> %x
diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll
index 77053e2..4dd883a 100644
--- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll
+++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll
@@ -255,8 +255,8 @@ define void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf)
 ; CHECK-LABEL: gather_qps:
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: kxnorw %k0, %k0, %k2
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k2
 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
 ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
@@ -520,7 +520,7 @@ define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, ptr %x1,
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: kmovd %esi, %k1
 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
 ; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -772,7 +772,7 @@ define void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i32> %x2, <
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: kmovd %esi, %k1
 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -788,7 +788,7 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, <
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: kmovd %esi, %k1
 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -800,9 +800,9 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, <
 define void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) {
 ; CHECK-LABEL: scatter_mask_test:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
-; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: kxorb %k0, %k0, %k1
 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT: movb $1, %al
 ; CHECK-NEXT: kmovd %eax, %k1
diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index df71e3c..5ed91ea 100644
--- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -251,9 +251,9 @@ define dso_local void @scatter_mask_qps_execdomain(<8 x i64> %ind, ptr %src, i8
 define dso_local void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) {
 ; CHECK-LABEL: gather_qps:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: kxnorw %k0, %k0, %k2
+; CHECK-NEXT: kxnorb %k0, %k0, %k2
 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
 ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
@@ -523,7 +523,7 @@ define <8 x float> @test_int_x86_avx512_mask_gather3siv8_sf(<8 x float> %x0, ptr
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: kmovd %esi, %k1
 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
 ; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -774,7 +774,7 @@ define dso_local void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: kmovd %esi, %k1
 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -789,7 +789,7 @@ define dso_local void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: kmovd %esi, %k1
 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -802,9 +802,9 @@ define dso_local void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i
 define dso_local void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) {
 ; CHECK-LABEL: scatter_mask_test:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
-; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: kxorb %k0, %k0, %k1
 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT: movb $1, %al
 ; CHECK-NEXT: kmovd %eax, %k1
@@ -856,7 +856,7 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, ptr %b
 define <8 x float> @gather_global(<8 x i64>, ptr nocapture readnone) {
 ; CHECK-LABEL: gather_global:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT: vgatherqps x(,%zmm0,4), %ymm1 {%k1}
 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir b/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir
new file mode 100644
index 0000000..0d8f217
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir
@@ -0,0 +1,26 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+# RUN: llc %s -mtriple=i386-- -start-before=postrapseudos -o - | FileCheck %s
+
+--- |
+ target triple = "i386-unknown-unknown"
+
+ define void @setallones() #0 {
+ ; CHECK-LABEL: setallones:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+ ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+ entry:
+ unreachable
+ }
+
+ attributes #0 = { "target-features"="+avx512f,+avx512vl" }
+---
+name: setallones
+tracksRegLiveness: true
+liveins: []
+body: |
+ bb.0:
+ $xmm0 = AVX512_128_SETALLONES
+ $ymm1 = AVX512_256_SETALLONES
+
+...
diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
new file mode 100644
index 0000000..ca5f319
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
@@ -0,0 +1,229 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512DQBW
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
+declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>)
+
+; Test case 1: Direct v8i1 all-ones mask (should use kxnorb on AVX512DQ)
+define <8 x float> @mask_v8i1_allones(ptr %ptr) {
+; AVX512F-LABEL: mask_v8i1_allones:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movw $255, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: mask_v8i1_allones:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: mask_v8i1_allones:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movw $255, %ax
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: mask_v8i1_allones:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512DQBW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQBW-NEXT: retq
+ %res = call <8 x float> @llvm.masked.expandload.v8f32(ptr %ptr, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> zeroinitializer)
+ ret <8 x float> %res
+}
+
+; Test case 2: v16i1 with lower 8 bits set via bitconvert (should use kxnorb on AVX512DQ)
+define <16 x float> @mask_v16i1_lower8(ptr %ptr) {
+; AVX512F-LABEL: mask_v16i1_lower8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movw $255, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: mask_v16i1_lower8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: mask_v16i1_lower8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movw $255, %ax
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: mask_v16i1_lower8:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512DQBW-NEXT: retq
+ %res = call <16 x float> @llvm.masked.expandload.v16f32(ptr %ptr, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> zeroinitializer)
+ ret <16 x float> %res
+}
+
+; Test case 3: v16i1 with all bits set (should use kxnorw on all targets)
+define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) {
+; AVX512-LABEL: gather_all:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kxnorw %k0, %k0, %k1
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> poison)
+ ret <16 x float> %res
+}
+
+; Test case 4: v8i1 with lower 8 bits set in gather (should use kxnorb on AVX512DQ targets)
+define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) {
+; AVX512F-LABEL: gather_lower:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: movw $255, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: gather_lower:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: gather_lower:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: movw $255, %ax
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: gather_lower:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQBW-NEXT: retq
+ %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> poison)
+ ret <16 x float> %res
+}
+
+; Test case 5: v32i1 mask via bitconvert combined with dynamic condition.
+; Ensures lower 16 lanes force the KSET1W path without folding into a shuffle.
+define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
+; AVX512F-LABEL: mask_v32i1_lower16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: mask_v32i1_lower16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: mask_v32i1_lower16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movl $65535, %eax # imm = 0xFFFF
+; AVX512BW-NEXT: kmovd %eax, %k0
+; AVX512BW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: kord %k0, %k1, %k1
+; AVX512BW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: mask_v32i1_lower16:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k0
+; AVX512DQBW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1
+; AVX512DQBW-NEXT: kord %k0, %k1, %k1
+; AVX512DQBW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512DQBW-NEXT: retq
+ %mask0 = bitcast i32 65535 to <32 x i1>
+ %mask1 = icmp sgt <32 x i16> %c, %d
+ %mask = or <32 x i1> %mask0, %mask1
+ %res = select <32 x i1> %mask, <32 x i16> %a, <32 x i16> %b
+ ret <32 x i16> %res
+}
+
+; Test case 6: v64i1 mask via bitconvert combined with dynamic condition.
+; Verifies the KSET1D submask pattern survives past SelectionDAG combines.
+define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
+; AVX512F-LABEL: mask_v64i1_lower32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: mask_v64i1_lower32:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: mask_v64i1_lower32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; AVX512BW-NEXT: kmovq %rax, %k0
+; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: korq %k0, %k1, %k1
+; AVX512BW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: mask_v64i1_lower32:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnord %k0, %k0, %k0
+; AVX512DQBW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1
+; AVX512DQBW-NEXT: korq %k0, %k1, %k1
+; AVX512DQBW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512DQBW-NEXT: retq
+ %mask0 = bitcast i64 4294967295 to <64 x i1>
+ %mask1 = icmp sgt <64 x i8> %c, %d
+ %mask = or <64 x i1> %mask0, %mask1
+ %res = select <64 x i1> %mask, <64 x i8> %a, <64 x i8> %b
+ ret <64 x i8> %res
+}
+
diff --git a/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir b/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir
new file mode 100644
index 0000000..7e5ddc4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir
@@ -0,0 +1,30 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+# RUN: llc %s -mtriple=x86_64-- -start-before=postrapseudos -o - | FileCheck %s
+
+--- |
+ target triple = "x86_64-unknown-unknown"
+
+ define void @setallones() #0 {
+ ; CHECK-LABEL: setallones:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vpcmpeqd %xmm14, %xmm14, %xmm14
+ ; CHECK-NEXT: vpternlogd {{.*#+}} xmm16 = -1
+ ; CHECK-NEXT: vpcmpeqd %ymm15, %ymm15, %ymm15
+ ; CHECK-NEXT: vpternlogd {{.*#+}} ymm17 = -1
+ entry:
+ unreachable
+ }
+
+ attributes #0 = { "target-features"="+avx512f,+avx512vl" }
+---
+name: setallones
+tracksRegLiveness: true
+liveins: []
+body: |
+ bb.0:
+ $xmm14 = AVX512_128_SETALLONES
+ $xmm16 = AVX512_128_SETALLONES
+ $ymm15 = AVX512_256_SETALLONES
+ $ymm17 = AVX512_256_SETALLONES
+
+...
diff --git a/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index a24c1d8..7fb2041 100644
--- a/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -52,13 +52,12 @@ define <8 x i1> @test3(<4 x i1> %a) {
 define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
 ; CHECK-LABEL: test4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
-; CHECK-NEXT: vpmovd2m %xmm1, %k0
-; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vpmovd2m %xmm0, %k1
-; CHECK-NEXT: kshiftlb $4, %k0, %k0
-; CHECK-NEXT: korb %k0, %k1, %k0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpslld $31, %ymm0, %ymm0
+; CHECK-NEXT: vpmovd2m %ymm0, %k0
 ; CHECK-NEXT: vpmovm2w %k0, %xmm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %res = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -68,13 +67,12 @@ define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
 define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
 ; CHECK-LABEL: test5:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpsllq $63, %xmm1, %xmm1
-; CHECK-NEXT: vpmovq2m %xmm1, %k0
-; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vpmovq2m %xmm0, %k1
-; CHECK-NEXT: kshiftlb $2, %k0, %k0
-; CHECK-NEXT: korw %k0, %k1, %k0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsllq $63, %ymm0, %ymm0
+; CHECK-NEXT: vpmovq2m %ymm0, %k0
 ; CHECK-NEXT: vpmovm2d %k0, %xmm0
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %res = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
index b8ebe2a..ddf0050 100644
--- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
@@ -178,18 +178,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
 ret { <4 x i32>, <4 x i32> } %res2
 }
-declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>)
-define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_256:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x52,0xc2]
 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
 ret <8 x i32> %1
 }
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_256:
 ; X86: # %bb.0:
 ; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
@@ -209,11 +209,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; X64-NEXT: vpdpwssd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xda]
 ; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <8 x i32>, ptr %x2p
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %x2 = load <16 x i16>, ptr %x2p
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
 %2 = bitcast i8 %x3 to <8 x i1>
 %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
- %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4)
 %5 = bitcast i8 %x3 to <8 x i1>
 %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
 %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -221,18 +221,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ret { <8 x i32>, <8 x i32> } %res2
 }
-declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>)
-define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_128:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x52,0xc2]
 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
 ret <4 x i32> %1
 }
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_128:
 ; X86: # %bb.0:
 ; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
@@ -252,12 +252,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; X64-NEXT: vpdpwssd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xda]
 ; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <8 x i16>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
 %2 = bitcast i8 %x3 to <8 x i1>
 %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
- %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4)
 %5 = bitcast i8 %x3 to <8 x i1>
 %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
@@ -266,18 +266,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ret { <4 x i32>, <4 x i32> } %res2
 }
-declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x53,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) ret <8 x i32> %1 } -define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) { +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_256: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] @@ -297,11 +297,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; X64-NEXT: vpdpwssds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xda] ; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %x2 = load <16 x i16>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 - %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4) %5 = bitcast i8 %x3 to <8 x i1> %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0 @@ -309,9 +309,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ret { <8 x i32>, <8 x i32> } %res2 } -declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>) -define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p) { +define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p) { ; X86-LABEL: test_int_x86_avx512_vpdpwssds_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -322,12 +322,12 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; X64: # %bb.0: ; X64-NEXT: vpdpwssds (%rdi), %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x53,0x07] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %x2 = load <8 x i16>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) ret <4 x i32> %1 } -define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) { +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x 
i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_128: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] @@ -347,12 +347,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; X64-NEXT: vpdpwssds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xda] ; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %x2 = load <8 x i16>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 - %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4) %5 = bitcast i8 %x3 to <8 x i1> %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll index 63ff88a..2aabfab 100644 --- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll @@ -102,21 +102,39 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpbusds_512(<16 x ret { <16 x i32>, <16 x i32> } %res3 } -declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) -declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) -define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { -; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512: +define <16 x i32>@test_int_x86_avx512_vpdpwssd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpdpwssd: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + %res = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) ret <16 x i32> %res } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512: ; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: 
test_int_x86_avx512_mask_vpdpwssd_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512: +; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] @@ -125,7 +143,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512: +; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512: ; X64: # %bb.0: ; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] @@ -141,21 +159,39 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ret { <16 x i32>, <16 x i32> } %res3 } -declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) -declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + %res = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) ret <16 x i32> %res } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512: ; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssds_512(<16 x i32> %x0, <16 x 
i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512: +; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] @@ -164,7 +200,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512: +; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512: ; X64: # %bb.0: ; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll index 60d0298..e97b8a5 100644 --- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll @@ -86,18 +86,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x ret { <16 x i32>, <16 x i32> } %res2 } -declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <32 x i16>, <32 x i16>) -define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) ret <16 x i32> %1 } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512: ; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] @@ -116,11 +116,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; X64-NEXT: vpdpwssd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xda] ; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %x2 = load <32 x i16>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 - %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4) %5 = bitcast i16 %x3 to <16 x i1> %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 @@ -128,18 +128,18 @@ define { <16 x i32>, <16 x 
i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ret { <16 x i32>, <16 x i32> } %res2 } -declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <32 x i16>, <32 x i16>) -define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) { ; CHECK-LABEL: test_int_x86_avx512_ask_vpdpwssds_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) ret <16 x i32> %1 } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512: ; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] @@ -158,11 +158,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; X64-NEXT: vpdpwssds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xda] ; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %x2 = load <32 x i16>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 - %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4) %5 = bitcast i16 %x3 to <16 x i1> %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll index 0f4a4f2..f359ece 100644 --- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll @@ -45,3 +45,47 @@ define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) ret <8 x i32> %res } + +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx_vpdpwssd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> 
%x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx_vpdpwssd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx_vpdpwssds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx_vpdpwssds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll index de8b2a4..5748a42 100644 --- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll @@ -68,9 +68,9 @@ define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <1 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_256: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2] @@ -80,13 +80,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x52,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>) -define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_128: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2] @@ -96,13 +96,13 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x52,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: 
[0xc3] - %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_256: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2] @@ -112,13 +112,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x53,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>) -define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_128: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2] @@ -128,6 +128,6 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x53,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) ret <4 x i32> %res } diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll new file mode 100644 index 0000000..abdc296 --- /dev/null +++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10 + +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_128: +; AVX10: # %bb.0: +; AVX10-NEXT: 
vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusd %ymm2, 
%ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuuds %xmm2, %xmm1, 
%xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll index abdc296..7576b12 100644 --- a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10 ; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10 -define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2] @@ -14,12 +14,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2] @@ -29,12 +29,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> 
@test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2] @@ -44,12 +44,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2] @@ -59,12 +59,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2] @@ -74,12 +74,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2] @@ -89,12 +89,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> 
@llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2] @@ -104,12 +104,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2] @@ -119,12 +119,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2] @@ -134,12 +134,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256: 
; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2] @@ -149,12 +149,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2] @@ -164,12 +164,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2] @@ -179,7 +179,7 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) diff --git a/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll new file mode 100644 index 0000000..293b48d --- /dev/null +++ b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll @@ -0,0 +1,39 @@ +;; BB section test with basic block hashes. 
+ +;; basic block sections profile with bb hashes +; RUN: echo 'v1' > %t +; RUN: echo 'f foo' >> %t +; RUN: echo 'g 0:10,1:9,2:1 1:8,3:8 2:2,3:2 3:11' >> %t +; RUN: echo 'c 0 2 3' >> %t +; RUN: echo 'h 0:64863A11B5CA0000 1:54F1E80D6B270006 2:54F1F4E66B270008 3:C8BC6041A2CB0009' >> %t +; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t | FileCheck %s +; +define void @foo(i1 zeroext) nounwind { + %2 = alloca i8, align 1 + %3 = zext i1 %0 to i8 + store i8 %3, ptr %2, align 1 + %4 = load i8, ptr %2, align 1 + %5 = trunc i8 %4 to i1 + br i1 %5, label %6, label %8 + +6: ; preds = %1 + %7 = call i32 @bar() + br label %10 + +8: ; preds = %1 + %9 = call i32 @baz() + br label %10 + +10: ; preds = %8, %6 + ret void +} + +declare i32 @bar() #1 + +declare i32 @baz() #1 + +; CHECK: .section .text.foo,"ax",@progbits +; CHECK: callq baz +; CHECK: retq +; CHECK: .section .text.split.foo,"ax",@progbits +; CHECK: callq bar diff --git a/llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll new file mode 100644 index 0000000..6fe7bf5 --- /dev/null +++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll @@ -0,0 +1,93 @@ +; BB cluster section tests when using an edge profile and basic block hashes to generate clusters. +; In the tests, we first generate hash values for basic blocks and write them to the profile. +; When generating basic block clusters, we match the hashes of basic blocks in the current CFG +; with those in the profile. After a successful match, we retrieve the weights of the basic blocks +; and edges from the profile. Subsequently, we use an inference algorithm to deduce the complete +; weights of all basic blocks and edges. Finally, we generate "hot" and "cold" clusters based on +; these complete weights. +; In Test 1 and Test 2, the weights of basic blocks and edges in the profiles are different, which +; will ultimately result in distinct cluster partitioning outcomes. +; +; RUN: llc %s -O0 -mtriple=x86_64-pc-linux -function-sections -filetype=obj -basic-block-address-map -emit-bb-hash -o %t.o +; +; Test1: Basic blocks #0 (entry), #1 and #3 will be placed in the same section. +; The rest will be placed in the cold section. +; +; RUN: echo 'v1' > %t1 +; RUN: echo 'f foo' >> %t1 +; RUN: echo 'g 0:100,1:100,2:0 1:100,3:100 2:0,3:0 3:100' >> %t1 +; +; These commands read BB hashes from SHT_LLVM_BB_ADDR_MAP +; and put them into the basic block sections profile. +; RUN: llvm-readobj %t.o --bb-addr-map | \ +; RUN: awk 'BEGIN {printf "h"} \ +; RUN: /ID: [0-9]+/ {id=$2} \ +; RUN: /Hash: 0x[0-9A-Fa-f]+/ {gsub(/^0x/, "", $2); hash=$2; printf " %%s:%%s", id, hash} \ +; RUN: END {print ""}' \ +; RUN: >> %t1 +; +; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 -basic-block-section-match-infer | \ +; RUN: FileCheck %s -check-prefixes=CHECK,LINUX-SECTIONS1 +; +; Test2: Basic blocks #0 (entry), #2 and #3 will be placed in the same section. +; The rest will be placed in the cold section. +; +; RUN: echo 'v1' > %t2 +; RUN: echo 'f foo' >> %t2 +; RUN: echo 'g 0:100,1:0,2:100 1:0,3:0 2:100,3:100 3:100' >> %t2 +; +; These commands read BB hashes from SHT_LLVM_BB_ADDR_MAP +; and put them into the basic block sections profile.
+; RUN: llvm-readobj %t.o --bb-addr-map | \ +; RUN: awk 'BEGIN {printf "h"} \ +; RUN: /ID: [0-9]+/ {id=$2} \ +; RUN: /Hash: 0x[0-9A-Fa-f]+/ {gsub(/^0x/, "", $2); hash=$2; printf " %%s:%%s", id, hash} \ +; RUN: END {print ""}' \ +; RUN: >> %t2 +; +; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 -basic-block-section-match-infer | \ +; RUN: FileCheck %s -check-prefixes=CHECK,LINUX-SECTIONS2 + +define void @foo(i1 zeroext) nounwind { + %2 = alloca i8, align 1 + %3 = zext i1 %0 to i8 + store i8 %3, ptr %2, align 1 + %4 = load i8, ptr %2, align 1 + %5 = trunc i8 %4 to i1 + br i1 %5, label %6, label %8 + +6: ; preds = %1 + %7 = call i32 @bar() + br label %10 + +8: ; preds = %1 + %9 = call i32 @baz() + br label %10 + +10: ; preds = %8, %6 + ret void +} + +declare i32 @bar() #1 + +declare i32 @baz() #1 + +; CHECK: .section .text.foo,"ax",@progbits +; CHECK-NOT: .section +; CHECK-LABEL: foo: +; CHECK-NOT: .section +; CHECK-NOT: .LBB_END0_{{0-9}}+ +; LINUX-SECTIONS1-LABEL: # %bb.1: +; LINUX-SECTIONS2-LABEL: # %bb.2: +; CHECK-NOT: .section +; CHECK-NOT: .LBB_END0_{{0-9}}+ +; CHECK-LABEL: .LBB0_3: +; CHECK-LABEL: .LBB_END0_3: +; CHECK-NEXT: .section .text.split.foo,"ax",@progbits +; CHECK-LABEL: foo.cold: +; LINUX-SECTIONS1-LABEL: .LBB_END0_2: +; LINUX-SECTIONS2-LABEL: .LBB_END0_1: +; LINUX-SECTIONS1-LABEL: .size foo.cold, .LBB_END0_2-foo.cold +; LINUX-SECTIONS2-LABEL: .size foo.cold, .LBB_END0_1-foo.cold +; CHECK-LABEL: .Lfunc_end0: +; CHECK-NEXT: .size foo, .Lfunc_end0-foo diff --git a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll index 751ab76..eb0a14b 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll @@ -69,6 +69,20 @@ ; RUN: echo 'g 0:4,1:2:3' >> %t15 ; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t15 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR15 ; CHECK-ERROR15: LLVM ERROR: invalid profile {{.*}} at line 4: unsigned integer expected: '2:3' +; RUN: echo 'v1' > %t16 +; RUN: echo 'f dummy1' >> %t16 +; RUN: echo 'c 0 1' >> %t16 +; RUN: echo 'g 0:4,1:2' >> %t16 +; RUN: echo 'h a:1111111111111111 1:ffffffffffffffff' >> %t16 +; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t16 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR16 +; CHECK-ERROR16: LLVM ERROR: invalid profile {{.*}} at line 5: unsigned integer expected: 'a' +; RUN: echo 'v1' > %t17 +; RUN: echo 'f dummy1' >> %t17 +; RUN: echo 'c 0 1' >> %t17 +; RUN: echo 'g 0:4,1:2' >> %t17 +; RUN: echo 'h 0:111111111111111g 1:ffffffffffffffff' >> %t17 +; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t17 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR17 +; CHECK-ERROR17: LLVM ERROR: invalid profile {{.*}} at line 5: unsigned integer expected in hex format: '111111111111111g' define i32 @dummy1(i32 %x, i32 %y, i32 %z) { diff --git a/llvm/test/CodeGen/X86/basic-block-sections-list.ll b/llvm/test/CodeGen/X86/basic-block-sections-list.ll index 45ef452..d171821 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-list.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-list.ll @@ -1,17 +1,13 @@ -;; Check the basic block sections list option. -;; version 0 profile: -; RUN: echo '!_Z3foob' > %t1 +;; Check that specifying the function in the basic block sections profile +;; without any other directives is a noop. 
;; -;; version 1 profile: -; RUN: echo 'v1' > %t2 -; RUN: echo 'f _Z3foob' >> %t2 +;; Specify the bb sections profile: +; RUN: echo 'v1' > %t +; RUN: echo 'f _Z3foob' >> %t ;; -; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX -; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX +; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t > %t.bbsections +; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections > %t.orig +; RUN: diff -u %t.orig %t.bbsections define i32 @_Z3foob(i1 zeroext %0) nounwind { %2 = alloca i32, align 4 @@ -41,45 +37,3 @@ define i32 @_Z3foob(i1 zeroext %0) nounwind { declare i32 @_Z3barv() #1 declare i32 @_Z3bazv() #1 - -define i32 @_Z3zipb(i1 zeroext %0) nounwind { - %2 = alloca i32, align 4 - %3 = alloca i8, align 1 - %4 = zext i1 %0 to i8 - store i8 %4, ptr %3, align 1 - %5 = load i8, ptr %3, align 1 - %6 = trunc i8 %5 to i1 - %7 = zext i1 %6 to i32 - %8 = icmp sgt i32 %7, 0 - br i1 %8, label %9, label %11 - -9: ; preds = %1 - %10 = call i32 @_Z3barv() - store i32 %10, ptr %2, align 4 - br label %13 - -11: ; preds = %1 - %12 = call i32 @_Z3bazv() - store i32 %12, ptr %2, align 4 - br label %13 - -13: ; preds = %11, %9 - %14 = load i32, ptr %2, align 4 - ret i32 %14 -} - -; LINUX-SECTIONS-NO-GUIDED-PREFIX: .section .text._Z3foob,"ax",@progbits -; LINUX-SECTIONS: .section .text.hot._Z3foob,"ax",@progbits -; LINUX-SECTIONS: _Z3foob: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.1,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.1: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.2,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.2: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.3,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.3: - -; LINUX-SECTIONS-FUNCTION-SECTION: .section .text._Z3zipb,"ax",@progbits -; LINUX-SECTIONS-NO-FUNCTION-SECTION-NOT: .section .text{{.*}}._Z3zipb,"ax",@progbits -; LINUX-SECTIONS: _Z3zipb: -; LINUX-SECTIONS-NOT: .section .text{{.*}}._Z3zipb.__part.{{[0-9]+}},"ax",@progbits -; LINUX-SECTIONS-NOT: _Z3zipb.__part.{{[0-9]+}}: diff --git a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll index d481b14..6e0db20 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll @@ -1,6 +1,8 @@ -; RUN: echo "!foo" > %t.order.txt -; RUN: llc 
< %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt | FileCheck --check-prefix=SOURCE-DRIFT %s -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s +; RUN: echo "v1" > %t +; RUN: echo "f foo" >> %t +; RUN: echo "c 0" >> %t +; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t | FileCheck --check-prefix=SOURCE-DRIFT %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s define dso_local i32 @foo(i1 zeroext %0, i1 zeroext %1) !annotation !1 { br i1 %0, label %5, label %3 diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index 684e292..7bccd6b 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,BF16 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,FP16 -; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=CHECK,AVX,BF16,AVXNC +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64,SSE2 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=X64,AVX,AVX512,AVX512BF16 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=X64,AVX,AVX512,AVX512FP16 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=X64,AVX,AVXNC define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind { ; X86-LABEL: add: @@ -39,18 +39,18 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind { ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: retq ; -; F16-LABEL: add: -; F16: # %bb.0: -; F16-NEXT: movzwl (%rsi), %eax -; F16-NEXT: shll $16, %eax -; F16-NEXT: vmovd %eax, %xmm0 -; F16-NEXT: movzwl (%rdi), %eax -; F16-NEXT: shll $16, %eax -; F16-NEXT: vmovd %eax, %xmm1 -; F16-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; F16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 -; F16-NEXT: vpextrw $0, %xmm0, (%rdx) -; F16-NEXT: retq +; AVX512-LABEL: add: +; AVX512: # %bb.0: +; AVX512-NEXT: movzwl (%rsi), %eax +; AVX512-NEXT: shll $16, %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: movzwl (%rdi), %eax +; AVX512-NEXT: shll $16, %eax +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; AVX512-NEXT: vpextrw $0, %xmm0, (%rdx) +; AVX512-NEXT: retq ; ; AVXNC-LABEL: add: ; AVXNC: # %bb.0: @@ -98,17 +98,29 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind { ; SSE2-NEXT: popq %rax ; SSE2-NEXT: retq ; -; FP16-LABEL: add2: -; FP16: # %bb.0: -; FP16-NEXT: vmovw %xmm0, %eax -; FP16-NEXT: vmovw %xmm1, %ecx -; FP16-NEXT: shll $16, %ecx -; FP16-NEXT: vmovd %ecx, %xmm0 -; FP16-NEXT: shll $16, %eax -; FP16-NEXT: vmovd %eax, %xmm1 -; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 -; FP16-NEXT: retq +; AVX512BF16-LABEL: add2: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax +; 
AVX512BF16-NEXT: vpextrw $0, %xmm1, %ecx +; AVX512BF16-NEXT: shll $16, %ecx +; AVX512BF16-NEXT: vmovd %ecx, %xmm0 +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm1 +; AVX512BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; AVX512BF16-NEXT: retq +; +; AVX512FP16-LABEL: add2: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vmovw %xmm0, %eax +; AVX512FP16-NEXT: vmovw %xmm1, %ecx +; AVX512FP16-NEXT: shll $16, %ecx +; AVX512FP16-NEXT: vmovd %ecx, %xmm0 +; AVX512FP16-NEXT: shll $16, %eax +; AVX512FP16-NEXT: vmovd %eax, %xmm1 +; AVX512FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX512FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; AVX512FP16-NEXT: retq ; ; AVXNC-LABEL: add2: ; AVXNC: # %bb.0: @@ -189,34 +201,63 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; FP16-LABEL: add_double: -; FP16: # %bb.0: -; FP16-NEXT: pushq %rbp -; FP16-NEXT: pushq %r14 -; FP16-NEXT: pushq %rbx -; FP16-NEXT: movq %rdx, %rbx -; FP16-NEXT: movq %rsi, %r14 -; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; FP16-NEXT: callq __truncdfbf2@PLT -; FP16-NEXT: vmovw %xmm0, %ebp -; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; FP16-NEXT: callq __truncdfbf2@PLT -; FP16-NEXT: vmovw %xmm0, %eax -; FP16-NEXT: shll $16, %eax -; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: shll $16, %ebp -; FP16-NEXT: vmovd %ebp, %xmm1 -; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 -; FP16-NEXT: vmovw %xmm0, %eax -; FP16-NEXT: shll $16, %eax -; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; FP16-NEXT: vmovsd %xmm0, (%rbx) -; FP16-NEXT: popq %rbx -; FP16-NEXT: popq %r14 -; FP16-NEXT: popq %rbp -; FP16-NEXT: retq +; AVX512BF16-LABEL: add_double: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rbp +; AVX512BF16-NEXT: pushq %r14 +; AVX512BF16-NEXT: pushq %rbx +; AVX512BF16-NEXT: movq %rdx, %rbx +; AVX512BF16-NEXT: movq %rsi, %r14 +; AVX512BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512BF16-NEXT: callq __truncdfbf2@PLT +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %ebp +; AVX512BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512BF16-NEXT: callq __truncdfbf2@PLT +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm0 +; AVX512BF16-NEXT: shll $16, %ebp +; AVX512BF16-NEXT: vmovd %ebp, %xmm1 +; AVX512BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; AVX512BF16-NEXT: vmovd %xmm0, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm0 +; AVX512BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX512BF16-NEXT: vmovsd %xmm0, (%rbx) +; AVX512BF16-NEXT: popq %rbx +; AVX512BF16-NEXT: popq %r14 +; AVX512BF16-NEXT: popq %rbp +; AVX512BF16-NEXT: retq +; +; AVX512FP16-LABEL: add_double: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: pushq %rbp +; AVX512FP16-NEXT: pushq %r14 +; AVX512FP16-NEXT: pushq %rbx +; AVX512FP16-NEXT: movq %rdx, %rbx +; AVX512FP16-NEXT: movq %rsi, %r14 +; AVX512FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512FP16-NEXT: callq __truncdfbf2@PLT +; AVX512FP16-NEXT: vmovw %xmm0, %ebp +; AVX512FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512FP16-NEXT: callq __truncdfbf2@PLT +; AVX512FP16-NEXT: vmovw %xmm0, %eax +; AVX512FP16-NEXT: shll $16, %eax +; AVX512FP16-NEXT: vmovd %eax, %xmm0 +; AVX512FP16-NEXT: shll $16, %ebp +; AVX512FP16-NEXT: vmovd %ebp, %xmm1 +; AVX512FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX512FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; 
AVX512FP16-NEXT: vmovw %xmm0, %eax +; AVX512FP16-NEXT: shll $16, %eax +; AVX512FP16-NEXT: vmovd %eax, %xmm0 +; AVX512FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX512FP16-NEXT: vmovsd %xmm0, (%rbx) +; AVX512FP16-NEXT: popq %rbx +; AVX512FP16-NEXT: popq %r14 +; AVX512FP16-NEXT: popq %rbp +; AVX512FP16-NEXT: retq ; ; AVXNC-LABEL: add_double: ; AVXNC: # %bb.0: @@ -310,30 +351,55 @@ define double @add_double2(double %da, double %db) nounwind { ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: retq ; -; FP16-LABEL: add_double2: -; FP16: # %bb.0: -; FP16-NEXT: pushq %rbx -; FP16-NEXT: subq $16, %rsp -; FP16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FP16-NEXT: callq __truncdfbf2@PLT -; FP16-NEXT: vmovw %xmm0, %ebx -; FP16-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload -; FP16-NEXT: # xmm0 = mem[0],zero -; FP16-NEXT: callq __truncdfbf2@PLT -; FP16-NEXT: vmovw %xmm0, %eax -; FP16-NEXT: shll $16, %eax -; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: shll $16, %ebx -; FP16-NEXT: vmovd %ebx, %xmm1 -; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 -; FP16-NEXT: vmovw %xmm0, %eax -; FP16-NEXT: shll $16, %eax -; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; FP16-NEXT: addq $16, %rsp -; FP16-NEXT: popq %rbx -; FP16-NEXT: retq +; AVX512BF16-LABEL: add_double2: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rbx +; AVX512BF16-NEXT: subq $16, %rsp +; AVX512BF16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BF16-NEXT: callq __truncdfbf2@PLT +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %ebx +; AVX512BF16-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload +; AVX512BF16-NEXT: # xmm0 = mem[0],zero +; AVX512BF16-NEXT: callq __truncdfbf2@PLT +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm0 +; AVX512BF16-NEXT: shll $16, %ebx +; AVX512BF16-NEXT: vmovd %ebx, %xmm1 +; AVX512BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; AVX512BF16-NEXT: vmovd %xmm0, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm0 +; AVX512BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX512BF16-NEXT: addq $16, %rsp +; AVX512BF16-NEXT: popq %rbx +; AVX512BF16-NEXT: retq +; +; AVX512FP16-LABEL: add_double2: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: pushq %rbx +; AVX512FP16-NEXT: subq $16, %rsp +; AVX512FP16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512FP16-NEXT: callq __truncdfbf2@PLT +; AVX512FP16-NEXT: vmovw %xmm0, %ebx +; AVX512FP16-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; AVX512FP16-NEXT: # xmm0 = mem[0],zero +; AVX512FP16-NEXT: callq __truncdfbf2@PLT +; AVX512FP16-NEXT: vmovw %xmm0, %eax +; AVX512FP16-NEXT: shll $16, %eax +; AVX512FP16-NEXT: vmovd %eax, %xmm0 +; AVX512FP16-NEXT: shll $16, %ebx +; AVX512FP16-NEXT: vmovd %ebx, %xmm1 +; AVX512FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX512FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; AVX512FP16-NEXT: vmovw %xmm0, %eax +; AVX512FP16-NEXT: shll $16, %eax +; AVX512FP16-NEXT: vmovd %eax, %xmm0 +; AVX512FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX512FP16-NEXT: addq $16, %rsp +; AVX512FP16-NEXT: popq %rbx +; AVX512FP16-NEXT: retq ; ; AVXNC-LABEL: add_double2: ; AVXNC: # %bb.0: @@ -393,15 +459,15 @@ define void @add_constant(ptr %pa, ptr %pc) nounwind { ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: retq ; -; F16-LABEL: add_constant: -; F16: # %bb.0: -; F16-NEXT: movzwl (%rdi), %eax -; F16-NEXT: shll $16, %eax -; F16-NEXT: 
vmovd %eax, %xmm0 -; F16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; F16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 -; F16-NEXT: vpextrw $0, %xmm0, (%rsi) -; F16-NEXT: retq +; AVX512-LABEL: add_constant: +; AVX512: # %bb.0: +; AVX512-NEXT: movzwl (%rdi), %eax +; AVX512-NEXT: shll $16, %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX512-NEXT: retq ; ; AVXNC-LABEL: add_constant: ; AVXNC: # %bb.0: @@ -439,14 +505,23 @@ define bfloat @add_constant2(bfloat %a) nounwind { ; SSE2-NEXT: popq %rax ; SSE2-NEXT: retq ; -; FP16-LABEL: add_constant2: -; FP16: # %bb.0: -; FP16-NEXT: vmovw %xmm0, %eax -; FP16-NEXT: shll $16, %eax -; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 -; FP16-NEXT: retq +; AVX512BF16-LABEL: add_constant2: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm0 +; AVX512BF16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; AVX512BF16-NEXT: retq +; +; AVX512FP16-LABEL: add_constant2: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vmovw %xmm0, %eax +; AVX512FP16-NEXT: shll $16, %eax +; AVX512FP16-NEXT: vmovd %eax, %xmm0 +; AVX512FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; AVX512FP16-NEXT: retq ; ; AVXNC-LABEL: add_constant2: ; AVXNC: # %bb.0: @@ -467,10 +542,10 @@ define void @store_constant(ptr %pc) nounwind { ; X86-NEXT: movw $16256, (%eax) # imm = 0x3F80 ; X86-NEXT: retl ; -; CHECK-LABEL: store_constant: -; CHECK: # %bb.0: -; CHECK-NEXT: movw $16256, (%rdi) # imm = 0x3F80 -; CHECK-NEXT: retq +; X64-LABEL: store_constant: +; X64: # %bb.0: +; X64-NEXT: movw $16256, (%rdi) # imm = 0x3F80 +; X64-NEXT: retq store bfloat 1.0, ptr %pc ret void } @@ -484,11 +559,11 @@ define void @fold_ext_trunc(ptr %pa, ptr %pc) nounwind { ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: retl ; -; CHECK-LABEL: fold_ext_trunc: -; CHECK: # %bb.0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movw %ax, (%rsi) -; CHECK-NEXT: retq +; X64-LABEL: fold_ext_trunc: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movw %ax, (%rsi) +; X64-NEXT: retq %a = load bfloat, ptr %pa %ext = fpext bfloat %a to float %trunc = fptrunc float %ext to bfloat @@ -502,9 +577,9 @@ define bfloat @fold_ext_trunc2(bfloat %a) nounwind { ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: retl ; -; CHECK-LABEL: fold_ext_trunc2: -; CHECK: # %bb.0: -; CHECK-NEXT: retq +; X64-LABEL: fold_ext_trunc2: +; X64: # %bb.0: +; X64-NEXT: retq %ext = fpext bfloat %a to float %trunc = fptrunc float %ext to bfloat ret bfloat %trunc @@ -526,11 +601,17 @@ define bfloat @fold_from_half(half %a) nounwind { ; SSE2-NEXT: popq %rax ; SSE2-NEXT: retq ; -; FP16-LABEL: fold_from_half: -; FP16: # %bb.0: -; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 -; FP16-NEXT: retq +; AVX512BF16-LABEL: fold_from_half: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; AVX512BF16-NEXT: retq +; +; AVX512FP16-LABEL: fold_from_half: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; AVX512FP16-NEXT: retq ; ; 
AVXNC-LABEL: fold_from_half: ; AVXNC: # %bb.0: @@ -561,21 +642,29 @@ define half @fold_to_half(bfloat %a) nounwind { ; SSE2-NEXT: popq %rax ; SSE2-NEXT: retq ; -; BF16-LABEL: fold_to_half: -; BF16: # %bb.0: -; BF16-NEXT: vpextrw $0, %xmm0, %eax -; BF16-NEXT: shll $16, %eax -; BF16-NEXT: vmovd %eax, %xmm0 -; BF16-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BF16-NEXT: retq -; -; FP16-LABEL: fold_to_half: -; FP16: # %bb.0: -; FP16-NEXT: vmovw %xmm0, %eax -; FP16-NEXT: shll $16, %eax -; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 -; FP16-NEXT: retq +; AVX512BF16-LABEL: fold_to_half: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm0 +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BF16-NEXT: retq +; +; AVX512FP16-LABEL: fold_to_half: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vmovw %xmm0, %eax +; AVX512FP16-NEXT: shll $16, %eax +; AVX512FP16-NEXT: vmovd %eax, %xmm0 +; AVX512FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; AVX512FP16-NEXT: retq +; +; AVXNC-LABEL: fold_to_half: +; AVXNC: # %bb.0: +; AVXNC-NEXT: vpextrw $0, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVXNC-NEXT: retq %ext = fpext bfloat %a to float %trunc = fptrunc float %ext to half ret half %trunc @@ -587,9 +676,9 @@ define bfloat @bitcast_from_half(half %a) nounwind { ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: retl ; -; CHECK-LABEL: bitcast_from_half: -; CHECK: # %bb.0: -; CHECK-NEXT: retq +; X64-LABEL: bitcast_from_half: +; X64: # %bb.0: +; X64-NEXT: retq %bc = bitcast half %a to bfloat ret bfloat %bc } @@ -600,9 +689,9 @@ define half @bitcast_to_half(bfloat %a) nounwind { ; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: retl ; -; CHECK-LABEL: bitcast_to_half: -; CHECK: # %bb.0: -; CHECK-NEXT: retq +; X64-LABEL: bitcast_to_half: +; X64: # %bb.0: +; X64-NEXT: retq %bc = bitcast bfloat %a to half ret half %bc } @@ -753,16 +842,16 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; F16-LABEL: addv: -; F16: # %bb.0: -; F16-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; F16-NEXT: vpslld $16, %ymm1, %ymm1 -; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; F16-NEXT: vpslld $16, %ymm0, %ymm0 -; F16-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0 -; F16-NEXT: vzeroupper -; F16-NEXT: retq +; AVX512-LABEL: addv: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512-NEXT: vpslld $16, %ymm1, %ymm1 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpslld $16, %ymm0, %ymm0 +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vcvtneps2bf16 %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; AVXNC-LABEL: addv: ; AVXNC: # %bb.0: @@ -791,16 +880,22 @@ define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) { ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: retq ; -; BF16-LABEL: pr62997: -; 
BF16: # %bb.0: -; BF16-NEXT: vpextrw $0, %xmm1, %eax -; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; BF16-NEXT: retq +; AVX512BF16-LABEL: pr62997: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: vpextrw $0, %xmm1, %eax +; AVX512BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX512BF16-NEXT: retq ; -; FP16-LABEL: pr62997: -; FP16: # %bb.0: -; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; FP16-NEXT: retq +; AVX512FP16-LABEL: pr62997: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512FP16-NEXT: retq +; +; AVXNC-LABEL: pr62997: +; AVXNC: # %bb.0: +; AVXNC-NEXT: vpextrw $0, %xmm1, %eax +; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVXNC-NEXT: retq %1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0 %2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1 ret <2 x bfloat> %2 @@ -820,10 +915,10 @@ define <32 x bfloat> @pr63017() { ; SSE2-NEXT: xorps %xmm3, %xmm3 ; SSE2-NEXT: retq ; -; F16-LABEL: pr63017: -; F16: # %bb.0: -; F16-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; F16-NEXT: retq +; AVX512-LABEL: pr63017: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq ; ; AVXNC-LABEL: pr63017: ; AVXNC: # %bb.0: @@ -1077,11 +1172,17 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: popq %r14 ; SSE2-NEXT: retq ; -; FP16-LABEL: pr63017_2: -; FP16: # %bb.0: -; FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0] -; FP16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1} -; FP16-NEXT: retq +; AVX512BF16-LABEL: pr63017_2: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] +; AVX512BF16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1} +; AVX512BF16-NEXT: retq +; +; AVX512FP16-LABEL: pr63017_2: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0] +; AVX512FP16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1} +; AVX512FP16-NEXT: retq ; ; AVXNC-LABEL: pr63017_2: ; AVXNC: # %bb.0: @@ -1118,12 +1219,19 @@ define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) { ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE2-NEXT: retq ; -; FP16-LABEL: pr62997_3: -; FP16: # %bb.0: -; FP16-NEXT: vmovw %xmm1, %eax -; FP16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 -; FP16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; FP16-NEXT: retq +; AVX512BF16-LABEL: pr62997_3: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: vpextrw $0, %xmm1, %eax +; AVX512BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; AVX512BF16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512BF16-NEXT: retq +; +; AVX512FP16-LABEL: pr62997_3: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vmovw %xmm1, %eax +; AVX512FP16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; AVX512FP16-NEXT: 
vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512FP16-NEXT: retq ; ; AVXNC-LABEL: pr62997_3: ; AVXNC: # %bb.0: @@ -1206,11 +1314,11 @@ define <16 x float> @pr64460_3(<16 x bfloat> %a) { ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: retq ; -; F16-LABEL: pr64460_3: -; F16: # %bb.0: -; F16-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; F16-NEXT: vpslld $16, %zmm0, %zmm0 -; F16-NEXT: retq +; AVX512-LABEL: pr64460_3: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512-NEXT: vpslld $16, %zmm0, %zmm0 +; AVX512-NEXT: retq ; ; AVXNC-LABEL: pr64460_3: ; AVXNC: # %bb.0: @@ -1248,12 +1356,12 @@ define <8 x double> @pr64460_4(<8 x bfloat> %a) { ; SSE2-NEXT: movaps %xmm4, %xmm0 ; SSE2-NEXT: retq ; -; F16-LABEL: pr64460_4: -; F16: # %bb.0: -; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; F16-NEXT: vpslld $16, %ymm0, %ymm0 -; F16-NEXT: vcvtps2pd %ymm0, %zmm0 -; F16-NEXT: retq +; AVX512-LABEL: pr64460_4: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpslld $16, %ymm0, %ymm0 +; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0 +; AVX512-NEXT: retq ; ; AVXNC-LABEL: pr64460_4: ; AVXNC: # %bb.0: @@ -1301,12 +1409,12 @@ define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind { ; SSE2-NEXT: addq $72, %rsp ; SSE2-NEXT: retq ; -; F16-LABEL: fptrunc_v4f32: -; F16: # %bb.0: -; F16-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0 -; F16-NEXT: vzeroupper -; F16-NEXT: retq +; AVX512-LABEL: fptrunc_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vcvtneps2bf16 %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; AVXNC-LABEL: fptrunc_v4f32: ; AVXNC: # %bb.0: @@ -1387,11 +1495,11 @@ define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; F16-LABEL: fptrunc_v8f32: -; F16: # %bb.0: -; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0 -; F16-NEXT: vzeroupper -; F16-NEXT: retq +; AVX512-LABEL: fptrunc_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtneps2bf16 %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; AVXNC-LABEL: fptrunc_v8f32: ; AVXNC: # %bb.0: @@ -1526,10 +1634,10 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; F16-LABEL: fptrunc_v16f32: -; F16: # %bb.0: -; F16-NEXT: vcvtneps2bf16 %zmm0, %ymm0 -; F16-NEXT: retq +; AVX512-LABEL: fptrunc_v16f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtneps2bf16 %zmm0, %ymm0 +; AVX512-NEXT: retq ; ; AVXNC-LABEL: fptrunc_v16f32: ; AVXNC: # %bb.0: @@ -1666,63 +1774,138 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; FP16-LABEL: fptrunc_v8f64: -; FP16: # %bb.0: -; FP16-NEXT: subq $184, %rsp -; FP16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; FP16-NEXT: vextractf128 $1, %ymm0, %xmm0 -; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; FP16-NEXT: vzeroupper -; FP16-NEXT: callq __truncdfbf2@PLT -; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; FP16-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; FP16-NEXT: callq __truncdfbf2@PLT -; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; FP16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; FP16-NEXT: # xmm0 = mem[1,0] -; FP16-NEXT: callq __truncdfbf2@PLT -; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; FP16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; FP16-NEXT: vzeroupper -; FP16-NEXT: callq __truncdfbf2@PLT -; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; FP16-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; FP16-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; FP16-NEXT: vzeroupper -; FP16-NEXT: callq __truncdfbf2@PLT -; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; FP16-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; FP16-NEXT: callq __truncdfbf2@PLT -; FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; FP16-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; FP16-NEXT: vzeroupper -; FP16-NEXT: callq __truncdfbf2@PLT -; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; FP16-NEXT: callq __truncdfbf2@PLT -; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; FP16-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; FP16-NEXT: addq $184, %rsp -; FP16-NEXT: retq +; AVX512BF16-LABEL: fptrunc_v8f64: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rbp +; AVX512BF16-NEXT: pushq %r15 +; AVX512BF16-NEXT: pushq %r14 +; AVX512BF16-NEXT: pushq %r13 +; AVX512BF16-NEXT: pushq %r12 +; AVX512BF16-NEXT: pushq %rbx +; AVX512BF16-NEXT: subq $184, %rsp +; AVX512BF16-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BF16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BF16-NEXT: vzeroupper +; AVX512BF16-NEXT: callq __truncdfbf2@PLT +; AVX512BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BF16-NEXT: vpermilpd $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512BF16-NEXT: # xmm0 = mem[1,0] +; AVX512BF16-NEXT: callq __truncdfbf2@PLT +; AVX512BF16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BF16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BF16-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512BF16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BF16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512BF16-NEXT: vzeroupper +; AVX512BF16-NEXT: callq __truncdfbf2@PLT +; AVX512BF16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BF16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BF16-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; AVX512BF16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512BF16-NEXT: vzeroupper +; AVX512BF16-NEXT: callq __truncdfbf2@PLT +; AVX512BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BF16-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512BF16-NEXT: # xmm0 = mem[1,0] +; AVX512BF16-NEXT: callq __truncdfbf2@PLT +; AVX512BF16-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512BF16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BF16-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BF16-NEXT: vzeroupper +; AVX512BF16-NEXT: callq __truncdfbf2@PLT +; AVX512BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BF16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512BF16-NEXT: # xmm0 = mem[1,0] +; AVX512BF16-NEXT: callq __truncdfbf2@PLT +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %ebx +; AVX512BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %ebp +; AVX512BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %r14d +; AVX512BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %r15d +; AVX512BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %r12d +; AVX512BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %r13d +; AVX512BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512BF16-NEXT: callq __truncdfbf2@PLT +; AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax +; AVX512BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512BF16-NEXT: vpinsrw $1, %r13d, %xmm0, %xmm0 +; AVX512BF16-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; AVX512BF16-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0 +; AVX512BF16-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0 +; AVX512BF16-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 +; AVX512BF16-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0 +; AVX512BF16-NEXT: vpinsrw $7, %ebx, %xmm0, %xmm0 +; AVX512BF16-NEXT: addq $184, %rsp +; AVX512BF16-NEXT: popq %rbx +; AVX512BF16-NEXT: popq %r12 +; AVX512BF16-NEXT: popq %r13 +; AVX512BF16-NEXT: popq %r14 +; AVX512BF16-NEXT: popq %r15 +; AVX512BF16-NEXT: popq %rbp +; AVX512BF16-NEXT: retq +; +; AVX512FP16-LABEL: fptrunc_v8f64: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: subq $184, %rsp +; AVX512FP16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512FP16-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512FP16-NEXT: 
vzeroupper +; AVX512FP16-NEXT: callq __truncdfbf2@PLT +; AVX512FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512FP16-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512FP16-NEXT: callq __truncdfbf2@PLT +; AVX512FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512FP16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512FP16-NEXT: # xmm0 = mem[1,0] +; AVX512FP16-NEXT: callq __truncdfbf2@PLT +; AVX512FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512FP16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512FP16-NEXT: vzeroupper +; AVX512FP16-NEXT: callq __truncdfbf2@PLT +; AVX512FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512FP16-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; AVX512FP16-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512FP16-NEXT: vzeroupper +; AVX512FP16-NEXT: callq __truncdfbf2@PLT +; AVX512FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512FP16-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512FP16-NEXT: callq __truncdfbf2@PLT +; AVX512FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512FP16-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512FP16-NEXT: vzeroupper +; AVX512FP16-NEXT: callq __truncdfbf2@PLT +; AVX512FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512FP16-NEXT: callq __truncdfbf2@PLT +; AVX512FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512FP16-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX512FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512FP16-NEXT: addq $184, %rsp +; AVX512FP16-NEXT: retq ; ; AVXNC-LABEL: fptrunc_v8f64: ; AVXNC: # %bb.0: @@ -1817,10 +2000,10 @@ define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) { ; SSE2-NEXT: movaps %xmm0, %xmm3 ; SSE2-NEXT: retq ; -; F16-LABEL: test_v8bf16_v32bf16: -; F16: # %bb.0: -; F16-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; F16-NEXT: retq +; AVX512-LABEL: test_v8bf16_v32bf16: +; AVX512: # 
%bb.0: +; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: retq ; ; AVXNC-LABEL: test_v8bf16_v32bf16: ; AVXNC: # %bb.0: @@ -1959,13 +2142,21 @@ define float @trunc_ext(float %a) nounwind { ; SSE2-NEXT: popq %rax ; SSE2-NEXT: retq ; -; FP16-LABEL: trunc_ext: -; FP16: # %bb.0: -; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 -; FP16-NEXT: vmovw %xmm0, %eax -; FP16-NEXT: shll $16, %eax -; FP16-NEXT: vmovd %eax, %xmm0 -; FP16-NEXT: retq +; AVX512BF16-LABEL: trunc_ext: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; AVX512BF16-NEXT: vmovd %xmm0, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm0 +; AVX512BF16-NEXT: retq +; +; AVX512FP16-LABEL: trunc_ext: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; AVX512FP16-NEXT: vmovw %xmm0, %eax +; AVX512FP16-NEXT: shll $16, %eax +; AVX512FP16-NEXT: vmovd %eax, %xmm0 +; AVX512FP16-NEXT: retq ; ; AVXNC-LABEL: trunc_ext: ; AVXNC: # %bb.0: @@ -2042,14 +2233,14 @@ define bfloat @PR108936(x86_fp80 %0) nounwind { ; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; -; CHECK-LABEL: PR108936: -; CHECK: # %bb.0: -; CHECK-NEXT: subq $24, %rsp -; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) -; CHECK-NEXT: fstpt (%rsp) -; CHECK-NEXT: callq __truncxfbf2@PLT -; CHECK-NEXT: addq $24, %rsp -; CHECK-NEXT: retq +; X64-LABEL: PR108936: +; X64: # %bb.0: +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: callq __truncxfbf2@PLT +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq %2 = fptrunc x86_fp80 %0 to bfloat ret bfloat %2 } @@ -2064,12 +2255,12 @@ define bfloat @PR115710(fp128 %0) nounwind { ; X86-NEXT: addl $28, %esp ; X86-NEXT: retl ; -; CHECK-LABEL: PR115710: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: callq __trunctfbf2@PLT -; CHECK-NEXT: popq %rax -; CHECK-NEXT: retq +; X64-LABEL: PR115710: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: callq __trunctfbf2@PLT +; X64-NEXT: popq %rax +; X64-NEXT: retq %2 = fptrunc fp128 %0 to bfloat ret bfloat %2 } diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll index 86d7df0c..fae1ff9 100644 --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -216,8 +216,8 @@ define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind { define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind { ; SSE-LABEL: bitcast_v16i8_to_v2i8: ; SSE: # %bb.0: -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -225,8 +225,8 @@ define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind { ; ; AVX12-LABEL: bitcast_v16i8_to_v2i8: ; AVX12: # %bb.0: -; AVX12-NEXT: vpmovmskb %xmm0, %ecx -; AVX12-NEXT: movl %ecx, %eax +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: movl %eax, %ecx ; AVX12-NEXT: shrl $8, %eax ; AVX12-NEXT: addb %cl, %al ; AVX12-NEXT: # kill: def $al killed $al killed $eax @@ -441,8 +441,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: bitcast_v16i16_to_v2i8: ; SSE: # %bb.0: ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -452,8 +452,8 @@ define i8 
@bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %eax ; AVX1-NEXT: addb %cl, %al ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -464,8 +464,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovmskb %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %eax ; AVX2-NEXT: addb %cl, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -762,8 +762,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: packssdw %xmm3, %xmm2 ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -776,8 +776,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %eax ; AVX1-NEXT: addb %cl, %al ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -793,8 +793,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-NEXT: vpmovmskb %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %eax ; AVX2-NEXT: addb %cl, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll index 13149d7..749b3dd 100644 --- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll +++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll @@ -1,8 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=AVX512,AVX512POPCNT ; ; CTPOP @@ -16,6 +17,14 @@ define i32 @test_ctpop_i128(i128 %a0) nounwind { ; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_ctpop_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: 
popcntq %rsi, %rcx +; AVX512-NEXT: popcntq %rdi, %rax +; AVX512-NEXT: addl %ecx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq %cnt = call i128 @llvm.ctpop.i128(i128 %a0) %res = trunc i128 %cnt to i32 ret i32 %res @@ -29,12 +38,77 @@ define i32 @load_ctpop_i128(ptr %p0) nounwind { ; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq +; +; AVX512-LABEL: load_ctpop_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: popcntq 8(%rdi), %rcx +; AVX512-NEXT: popcntq (%rdi), %rax +; AVX512-NEXT: addl %ecx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq %a0 = load i128, ptr %p0 %cnt = call i128 @llvm.ctpop.i128(i128 %a0) %res = trunc i128 %cnt to i32 ret i32 %res } +define i32 @vector_ctpop_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctpop_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: popcntq %rcx, %rcx +; SSE-NEXT: popcntq %rax, %rax +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctpop_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: popcntq %rax, %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: popcntq %rcx, %rax +; AVX2-NEXT: addl %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctpop_i128: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vmovq %xmm0, %rcx +; AVX512F-NEXT: popcntq %rax, %rdx +; AVX512F-NEXT: popcntq %rcx, %rax +; AVX512F-NEXT: addl %edx, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctpop_i128: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: popcntq %rcx, %rcx +; AVX512VL-NEXT: popcntq %rax, %rax +; AVX512VL-NEXT: addl %ecx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctpop_i128: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: popcntq %rcx, %rcx +; AVX512POPCNT-NEXT: popcntq %rax, %rax +; AVX512POPCNT-NEXT: addl %ecx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.ctpop.i128(i128 %a0) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_ctpop_i256(i256 %a0) nounwind { ; CHECK-LABEL: test_ctpop_i256: ; CHECK: # %bb.0: @@ -50,6 +124,48 @@ define i32 @test_ctpop_i256(i256 %a0) nounwind { ; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq +; +; AVX512F-LABEL: test_ctpop_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: popcntq %rcx, %rax +; AVX512F-NEXT: popcntq %rdx, %rcx +; AVX512F-NEXT: addl %eax, %ecx +; AVX512F-NEXT: popcntq %rsi, %rdx +; AVX512F-NEXT: popcntq %rdi, %rax +; AVX512F-NEXT: addl %edx, %eax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_ctpop_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: popcntq %rcx, %rax +; AVX512VL-NEXT: xorl %ecx, %ecx +; AVX512VL-NEXT: popcntq %rdx, %rcx +; AVX512VL-NEXT: addl %eax, %ecx +; AVX512VL-NEXT: xorl %edx, %edx +; AVX512VL-NEXT: popcntq %rsi, %rdx +; AVX512VL-NEXT: xorl %eax, 
%eax +; AVX512VL-NEXT: popcntq %rdi, %rax +; AVX512VL-NEXT: addl %edx, %eax +; AVX512VL-NEXT: addl %ecx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: test_ctpop_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: popcntq %rcx, %rax +; AVX512POPCNT-NEXT: xorl %ecx, %ecx +; AVX512POPCNT-NEXT: popcntq %rdx, %rcx +; AVX512POPCNT-NEXT: addl %eax, %ecx +; AVX512POPCNT-NEXT: xorl %edx, %edx +; AVX512POPCNT-NEXT: popcntq %rsi, %rdx +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq %rdi, %rax +; AVX512POPCNT-NEXT: addl %edx, %eax +; AVX512POPCNT-NEXT: addl %ecx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq %cnt = call i256 @llvm.ctpop.i256(i256 %a0) %res = trunc i256 %cnt to i32 ret i32 %res @@ -81,24 +197,150 @@ define i32 @load_ctpop_i256(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_ctpop_i256: -; AVX512: # %bb.0: -; AVX512-NEXT: popcntq 24(%rdi), %rax -; AVX512-NEXT: popcntq 16(%rdi), %rcx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: popcntq 8(%rdi), %rdx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq (%rdi), %rax -; AVX512-NEXT: addl %edx, %eax -; AVX512-NEXT: addl %ecx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: retq +; AVX512F-LABEL: load_ctpop_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: popcntq 24(%rdi), %rax +; AVX512F-NEXT: popcntq 16(%rdi), %rcx +; AVX512F-NEXT: addl %eax, %ecx +; AVX512F-NEXT: popcntq 8(%rdi), %rdx +; AVX512F-NEXT: popcntq (%rdi), %rax +; AVX512F-NEXT: addl %edx, %eax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_ctpop_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: popcntq 24(%rdi), %rax +; AVX512VL-NEXT: popcntq 16(%rdi), %rcx +; AVX512VL-NEXT: addl %eax, %ecx +; AVX512VL-NEXT: popcntq 8(%rdi), %rdx +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq (%rdi), %rax +; AVX512VL-NEXT: addl %edx, %eax +; AVX512VL-NEXT: addl %ecx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_ctpop_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: popcntq 24(%rdi), %rax +; AVX512POPCNT-NEXT: popcntq 16(%rdi), %rcx +; AVX512POPCNT-NEXT: addl %eax, %ecx +; AVX512POPCNT-NEXT: popcntq 8(%rdi), %rdx +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq (%rdi), %rax +; AVX512POPCNT-NEXT: addl %edx, %eax +; AVX512POPCNT-NEXT: addl %ecx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq %a0 = load i256, ptr %p0 %cnt = call i256 @llvm.ctpop.i256(i256 %a0) %res = trunc i256 %cnt to i32 ret i32 %res } +define i32 @vector_ctpop_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctpop_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: popcntq %rsi, %rsi +; SSE-NEXT: popcntq %rdx, %rdx +; SSE-NEXT: addl %esi, %edx +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: popcntq %rax, %rsi +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: popcntq %rcx, %rax +; SSE-NEXT: addl %esi, %eax +; SSE-NEXT: addl %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctpop_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: 
vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: popcntq %rdx, %rdx +; AVX2-NEXT: popcntq %rsi, %rsi +; AVX2-NEXT: addl %edx, %esi +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: popcntq %rax, %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: popcntq %rcx, %rax +; AVX2-NEXT: addl %edx, %eax +; AVX2-NEXT: addl %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctpop_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vmovq %xmm0, %rcx +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: popcntq %rdx, %rdx +; AVX512F-NEXT: popcntq %rsi, %rsi +; AVX512F-NEXT: addl %edx, %esi +; AVX512F-NEXT: popcntq %rax, %rdx +; AVX512F-NEXT: popcntq %rcx, %rax +; AVX512F-NEXT: addl %edx, %eax +; AVX512F-NEXT: addl %esi, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctpop_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vmovq %xmm0, %rcx +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rdx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512VL-NEXT: popcntq %rsi, %rsi +; AVX512VL-NEXT: popcntq %rdx, %rdx +; AVX512VL-NEXT: addl %esi, %edx +; AVX512VL-NEXT: xorl %esi, %esi +; AVX512VL-NEXT: popcntq %rax, %rsi +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq %rcx, %rax +; AVX512VL-NEXT: addl %esi, %eax +; AVX512VL-NEXT: addl %edx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctpop_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rax +; AVX512POPCNT-NEXT: vmovq %xmm0, %rcx +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512POPCNT-NEXT: popcntq %rsi, %rsi +; AVX512POPCNT-NEXT: popcntq %rdx, %rdx +; AVX512POPCNT-NEXT: addl %esi, %edx +; AVX512POPCNT-NEXT: xorl %esi, %esi +; AVX512POPCNT-NEXT: popcntq %rax, %rsi +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq %rcx, %rax +; AVX512POPCNT-NEXT: addl %esi, %eax +; AVX512POPCNT-NEXT: addl %edx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.ctpop.i256(i256 %a0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 @test_ctpop_i512(i512 %a0) nounwind { ; CHECK-LABEL: test_ctpop_i512: ; CHECK: # %bb.0: @@ -124,6 +366,76 @@ define i32 @test_ctpop_i512(i512 %a0) nounwind { ; CHECK-NEXT: addl %r8d, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq +; +; AVX512F-LABEL: test_ctpop_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: addl %eax, %r10d +; AVX512F-NEXT: popcntq %r9, %rax +; AVX512F-NEXT: popcntq %r8, %r8 +; AVX512F-NEXT: addl %eax, %r8d +; AVX512F-NEXT: addl %r10d, %r8d +; AVX512F-NEXT: popcntq %rcx, %rax +; AVX512F-NEXT: popcntq %rdx, %rcx +; AVX512F-NEXT: addl %eax, %ecx +; AVX512F-NEXT: popcntq %rsi, %rdx +; AVX512F-NEXT: popcntq %rdi, %rax +; AVX512F-NEXT: addl %edx, %eax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: addl %r8d, %eax +; 
AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_ctpop_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 +; AVX512VL-NEXT: addl %eax, %r10d +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq %r9, %rax +; AVX512VL-NEXT: popcntq %r8, %r8 +; AVX512VL-NEXT: addl %eax, %r8d +; AVX512VL-NEXT: addl %r10d, %r8d +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq %rcx, %rax +; AVX512VL-NEXT: xorl %ecx, %ecx +; AVX512VL-NEXT: popcntq %rdx, %rcx +; AVX512VL-NEXT: addl %eax, %ecx +; AVX512VL-NEXT: xorl %edx, %edx +; AVX512VL-NEXT: popcntq %rsi, %rdx +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq %rdi, %rax +; AVX512VL-NEXT: addl %edx, %eax +; AVX512VL-NEXT: addl %ecx, %eax +; AVX512VL-NEXT: addl %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: test_ctpop_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 +; AVX512POPCNT-NEXT: addl %eax, %r10d +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq %r9, %rax +; AVX512POPCNT-NEXT: popcntq %r8, %r8 +; AVX512POPCNT-NEXT: addl %eax, %r8d +; AVX512POPCNT-NEXT: addl %r10d, %r8d +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq %rcx, %rax +; AVX512POPCNT-NEXT: xorl %ecx, %ecx +; AVX512POPCNT-NEXT: popcntq %rdx, %rcx +; AVX512POPCNT-NEXT: addl %eax, %ecx +; AVX512POPCNT-NEXT: xorl %edx, %edx +; AVX512POPCNT-NEXT: popcntq %rsi, %rdx +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq %rdi, %rax +; AVX512POPCNT-NEXT: addl %edx, %eax +; AVX512POPCNT-NEXT: addl %ecx, %eax +; AVX512POPCNT-NEXT: addl %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq %cnt = call i512 @llvm.ctpop.i512(i512 %a0) %res = trunc i512 %cnt to i32 ret i32 %res @@ -177,35 +489,239 @@ define i32 @load_ctpop_i512(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_ctpop_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: popcntq 56(%rdi), %rax -; AVX512-NEXT: popcntq 48(%rdi), %rcx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq 40(%rdi), %rax -; AVX512-NEXT: popcntq 32(%rdi), %rdx -; AVX512-NEXT: addl %eax, %edx -; AVX512-NEXT: addl %ecx, %edx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq 24(%rdi), %rax -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: popcntq 16(%rdi), %rcx -; AVX512-NEXT: popcntq 8(%rdi), %rsi -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq (%rdi), %rax -; AVX512-NEXT: addl %esi, %eax -; AVX512-NEXT: addl %ecx, %eax -; AVX512-NEXT: addl %edx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: retq +; AVX512F-LABEL: load_ctpop_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: popcntq 56(%rdi), %rax +; AVX512F-NEXT: popcntq 48(%rdi), %rcx +; AVX512F-NEXT: addl %eax, %ecx +; AVX512F-NEXT: popcntq 40(%rdi), %rax +; AVX512F-NEXT: popcntq 32(%rdi), %rdx +; AVX512F-NEXT: addl %eax, %edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: popcntq 24(%rdi), %rcx +; AVX512F-NEXT: popcntq 16(%rdi), %rsi +; AVX512F-NEXT: popcntq 8(%rdi), %r8 +; AVX512F-NEXT: popcntq (%rdi), %rax +; AVX512F-NEXT: addl %ecx, %esi +; AVX512F-NEXT: addl %r8d, %eax +; AVX512F-NEXT: addl %esi, %eax +; AVX512F-NEXT: addl %edx, %eax +; 
AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_ctpop_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: popcntq 56(%rdi), %rax +; AVX512VL-NEXT: popcntq 48(%rdi), %rcx +; AVX512VL-NEXT: addl %eax, %ecx +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq 40(%rdi), %rax +; AVX512VL-NEXT: popcntq 32(%rdi), %rdx +; AVX512VL-NEXT: addl %eax, %edx +; AVX512VL-NEXT: addl %ecx, %edx +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq 24(%rdi), %rax +; AVX512VL-NEXT: xorl %ecx, %ecx +; AVX512VL-NEXT: popcntq 16(%rdi), %rcx +; AVX512VL-NEXT: popcntq 8(%rdi), %rsi +; AVX512VL-NEXT: addl %eax, %ecx +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq (%rdi), %rax +; AVX512VL-NEXT: addl %esi, %eax +; AVX512VL-NEXT: addl %ecx, %eax +; AVX512VL-NEXT: addl %edx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_ctpop_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: popcntq 56(%rdi), %rax +; AVX512POPCNT-NEXT: popcntq 48(%rdi), %rcx +; AVX512POPCNT-NEXT: addl %eax, %ecx +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq 40(%rdi), %rax +; AVX512POPCNT-NEXT: popcntq 32(%rdi), %rdx +; AVX512POPCNT-NEXT: addl %eax, %edx +; AVX512POPCNT-NEXT: addl %ecx, %edx +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq 24(%rdi), %rax +; AVX512POPCNT-NEXT: xorl %ecx, %ecx +; AVX512POPCNT-NEXT: popcntq 16(%rdi), %rcx +; AVX512POPCNT-NEXT: popcntq 8(%rdi), %rsi +; AVX512POPCNT-NEXT: addl %eax, %ecx +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq (%rdi), %rax +; AVX512POPCNT-NEXT: addl %esi, %eax +; AVX512POPCNT-NEXT: addl %ecx, %eax +; AVX512POPCNT-NEXT: addl %edx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq %a0 = load i512, ptr %p0 %cnt = call i512 @llvm.ctpop.i512(i512 %a0) %res = trunc i512 %cnt to i32 ret i32 %res } +define i32 @vector_ctpop_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctpop_i512: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: pextrq $1, %xmm2, %rdi +; SSE-NEXT: movq %xmm2, %r8 +; SSE-NEXT: movq %xmm3, %r9 +; SSE-NEXT: pextrq $1, %xmm3, %r10 +; SSE-NEXT: popcntq %r10, %r10 +; SSE-NEXT: popcntq %r9, %r9 +; SSE-NEXT: addl %r10d, %r9d +; SSE-NEXT: popcntq %rdi, %rdi +; SSE-NEXT: popcntq %r8, %r8 +; SSE-NEXT: addl %edi, %r8d +; SSE-NEXT: addl %r9d, %r8d +; SSE-NEXT: popcntq %rsi, %rsi +; SSE-NEXT: popcntq %rdx, %rdx +; SSE-NEXT: addl %esi, %edx +; SSE-NEXT: popcntq %rcx, %rcx +; SSE-NEXT: popcntq %rax, %rax +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: addl %edx, %eax +; SSE-NEXT: addl %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctpop_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: vpextrq $1, %xmm1, %rdi +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %r9 +; AVX2-NEXT: vmovq %xmm0, %r10 +; AVX2-NEXT: popcntq %r9, %r9 +; AVX2-NEXT: popcntq %r10, %r10 +; AVX2-NEXT: addl %r9d, %r10d +; AVX2-NEXT: popcntq %rdi, %rdi +; AVX2-NEXT: popcntq %r8, %r8 +; AVX2-NEXT: addl %edi, %r8d +; AVX2-NEXT: addl %r10d, %r8d +; AVX2-NEXT: popcntq %rsi, %rsi +; AVX2-NEXT: 
popcntq %rdx, %rdx +; AVX2-NEXT: addl %esi, %edx +; AVX2-NEXT: popcntq %rcx, %rcx +; AVX2-NEXT: popcntq %rax, %rax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: addl %edx, %eax +; AVX2-NEXT: addl %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctpop_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rdi +; AVX512F-NEXT: vmovq %xmm1, %r8 +; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %r9 +; AVX512F-NEXT: vmovq %xmm0, %r10 +; AVX512F-NEXT: popcntq %r9, %r9 +; AVX512F-NEXT: popcntq %r10, %r10 +; AVX512F-NEXT: addl %r9d, %r10d +; AVX512F-NEXT: popcntq %rdi, %rdi +; AVX512F-NEXT: popcntq %r8, %r8 +; AVX512F-NEXT: addl %edi, %r8d +; AVX512F-NEXT: addl %r10d, %r8d +; AVX512F-NEXT: popcntq %rdx, %rdx +; AVX512F-NEXT: popcntq %rsi, %rsi +; AVX512F-NEXT: addl %edx, %esi +; AVX512F-NEXT: popcntq %rcx, %rcx +; AVX512F-NEXT: popcntq %rax, %rax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: addl %esi, %eax +; AVX512F-NEXT: addl %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctpop_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512VL-NEXT: vmovq %xmm0, %rsi +; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512VL-NEXT: vmovq %xmm1, %rdi +; AVX512VL-NEXT: vpextrq $1, %xmm1, %r8 +; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %r9 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %r10 +; AVX512VL-NEXT: popcntq %r10, %r10 +; AVX512VL-NEXT: popcntq %r9, %r9 +; AVX512VL-NEXT: addl %r10d, %r9d +; AVX512VL-NEXT: popcntq %r8, %r8 +; AVX512VL-NEXT: popcntq %rdi, %rdi +; AVX512VL-NEXT: addl %r8d, %edi +; AVX512VL-NEXT: addl %r9d, %edi +; AVX512VL-NEXT: popcntq %rdx, %rdx +; AVX512VL-NEXT: popcntq %rsi, %rsi +; AVX512VL-NEXT: addl %edx, %esi +; AVX512VL-NEXT: popcntq %rcx, %rcx +; AVX512VL-NEXT: popcntq %rax, %rax +; AVX512VL-NEXT: addl %ecx, %eax +; AVX512VL-NEXT: addl %esi, %eax +; AVX512VL-NEXT: addl %edi, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctpop_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vmovq %xmm1, %rax +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi +; AVX512POPCNT-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512POPCNT-NEXT: vmovq %xmm1, %rdi +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %r8 +; AVX512POPCNT-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512POPCNT-NEXT: vmovq %xmm0, %r9 +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %r10 +; AVX512POPCNT-NEXT: popcntq %r10, %r10 +; AVX512POPCNT-NEXT: popcntq %r9, %r9 +; AVX512POPCNT-NEXT: addl %r10d, %r9d +; AVX512POPCNT-NEXT: popcntq %r8, %r8 +; AVX512POPCNT-NEXT: popcntq %rdi, %rdi +; AVX512POPCNT-NEXT: addl %r8d, %edi +; AVX512POPCNT-NEXT: addl %r9d, %edi +; AVX512POPCNT-NEXT: popcntq %rdx, %rdx +; AVX512POPCNT-NEXT: popcntq %rsi, %rsi +; AVX512POPCNT-NEXT: addl %edx, %esi +; 
AVX512POPCNT-NEXT: popcntq %rcx, %rcx +; AVX512POPCNT-NEXT: popcntq %rax, %rax +; AVX512POPCNT-NEXT: addl %ecx, %eax +; AVX512POPCNT-NEXT: addl %esi, %eax +; AVX512POPCNT-NEXT: addl %edi, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.ctpop.i512(i512 %a0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_ctpop_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_ctpop_i1024: ; SSE: # %bb.0: @@ -309,57 +825,149 @@ define i32 @test_ctpop_i1024(i1024 %a0) nounwind { ; AVX2-NEXT: popq %r14 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_ctpop_i1024: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: addl %eax, %r10d -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: addl %eax, %r11d -; AVX512-NEXT: addl %r10d, %r11d -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: xorl %ebx, %ebx -; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %rbx -; AVX512-NEXT: xorl %r14d, %r14d -; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %r14 -; AVX512-NEXT: addl %eax, %ebx -; AVX512-NEXT: xorl %r10d, %r10d -; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: addl %r14d, %r10d -; AVX512-NEXT: addl %ebx, %r10d -; AVX512-NEXT: addl %r11d, %r10d -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: xorl %r11d, %r11d -; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: addl %eax, %r11d -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq %r9, %rax -; AVX512-NEXT: popcntq %r8, %r8 -; AVX512-NEXT: addl %eax, %r8d -; AVX512-NEXT: addl %r11d, %r8d -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq %rcx, %rax -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: popcntq %rdx, %rcx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: popcntq %rsi, %rdx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq %rdi, %rax -; AVX512-NEXT: addl %edx, %eax -; AVX512-NEXT: addl %ecx, %eax -; AVX512-NEXT: addl %r8d, %eax -; AVX512-NEXT: addl %r10d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_ctpop_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: addl %eax, %r10d +; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: addl %eax, %r11d +; AVX512F-NEXT: addl %r10d, %r11d +; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %rbx +; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %r14 +; AVX512F-NEXT: addl %eax, %ebx +; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: addl %r14d, %r10d +; AVX512F-NEXT: addl %ebx, %r10d +; AVX512F-NEXT: addl %r11d, %r10d +; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: addl %eax, %r11d +; AVX512F-NEXT: popcntq %r9, %rax +; AVX512F-NEXT: popcntq %r8, %r8 +; AVX512F-NEXT: addl %eax, %r8d +; AVX512F-NEXT: addl %r11d, %r8d +; AVX512F-NEXT: popcntq %rcx, %rax +; AVX512F-NEXT: popcntq %rdx, %rcx +; 
AVX512F-NEXT: addl %eax, %ecx +; AVX512F-NEXT: popcntq %rsi, %rdx +; AVX512F-NEXT: popcntq %rdi, %rax +; AVX512F-NEXT: addl %edx, %eax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: addl %r8d, %eax +; AVX512F-NEXT: addl %r10d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_ctpop_i1024: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: pushq %r14 +; AVX512VL-NEXT: pushq %rbx +; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 +; AVX512VL-NEXT: addl %eax, %r10d +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %r11 +; AVX512VL-NEXT: addl %eax, %r11d +; AVX512VL-NEXT: addl %r10d, %r11d +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: xorl %ebx, %ebx +; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %rbx +; AVX512VL-NEXT: xorl %r14d, %r14d +; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %r14 +; AVX512VL-NEXT: addl %eax, %ebx +; AVX512VL-NEXT: xorl %r10d, %r10d +; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 +; AVX512VL-NEXT: addl %r14d, %r10d +; AVX512VL-NEXT: addl %ebx, %r10d +; AVX512VL-NEXT: addl %r11d, %r10d +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: xorl %r11d, %r11d +; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %r11 +; AVX512VL-NEXT: addl %eax, %r11d +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq %r9, %rax +; AVX512VL-NEXT: popcntq %r8, %r8 +; AVX512VL-NEXT: addl %eax, %r8d +; AVX512VL-NEXT: addl %r11d, %r8d +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq %rcx, %rax +; AVX512VL-NEXT: xorl %ecx, %ecx +; AVX512VL-NEXT: popcntq %rdx, %rcx +; AVX512VL-NEXT: addl %eax, %ecx +; AVX512VL-NEXT: xorl %edx, %edx +; AVX512VL-NEXT: popcntq %rsi, %rdx +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq %rdi, %rax +; AVX512VL-NEXT: addl %edx, %eax +; AVX512VL-NEXT: addl %ecx, %eax +; AVX512VL-NEXT: addl %r8d, %eax +; AVX512VL-NEXT: addl %r10d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: popq %r14 +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: test_ctpop_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: pushq %r14 +; AVX512POPCNT-NEXT: pushq %rbx +; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 +; AVX512POPCNT-NEXT: addl %eax, %r10d +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r11 +; AVX512POPCNT-NEXT: addl %eax, %r11d +; AVX512POPCNT-NEXT: addl %r10d, %r11d +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512POPCNT-NEXT: xorl %ebx, %ebx +; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rbx +; AVX512POPCNT-NEXT: xorl %r14d, %r14d +; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r14 +; AVX512POPCNT-NEXT: addl %eax, %ebx +; AVX512POPCNT-NEXT: xorl %r10d, %r10d +; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r10 +; AVX512POPCNT-NEXT: addl %r14d, %r10d +; AVX512POPCNT-NEXT: addl %ebx, %r10d +; AVX512POPCNT-NEXT: addl %r11d, %r10d +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax +; AVX512POPCNT-NEXT: xorl %r11d, %r11d +; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r11 +; AVX512POPCNT-NEXT: addl %eax, %r11d +; AVX512POPCNT-NEXT: xorl 
%eax, %eax +; AVX512POPCNT-NEXT: popcntq %r9, %rax +; AVX512POPCNT-NEXT: popcntq %r8, %r8 +; AVX512POPCNT-NEXT: addl %eax, %r8d +; AVX512POPCNT-NEXT: addl %r11d, %r8d +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq %rcx, %rax +; AVX512POPCNT-NEXT: xorl %ecx, %ecx +; AVX512POPCNT-NEXT: popcntq %rdx, %rcx +; AVX512POPCNT-NEXT: addl %eax, %ecx +; AVX512POPCNT-NEXT: xorl %edx, %edx +; AVX512POPCNT-NEXT: popcntq %rsi, %rdx +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq %rdi, %rax +; AVX512POPCNT-NEXT: addl %edx, %eax +; AVX512POPCNT-NEXT: addl %ecx, %eax +; AVX512POPCNT-NEXT: addl %r8d, %eax +; AVX512POPCNT-NEXT: addl %r10d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: popq %rbx +; AVX512POPCNT-NEXT: popq %r14 +; AVX512POPCNT-NEXT: retq %cnt = call i1024 @llvm.ctpop.i1024(i1024 %a0) %res = trunc i1024 %cnt to i32 ret i32 %res @@ -460,52 +1068,135 @@ define i32 @load_ctpop_i1024(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_ctpop_i1024: -; AVX512: # %bb.0: -; AVX512-NEXT: popcntq 120(%rdi), %rax -; AVX512-NEXT: popcntq 112(%rdi), %rcx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq 104(%rdi), %rax -; AVX512-NEXT: popcntq 96(%rdi), %rdx -; AVX512-NEXT: addl %eax, %edx -; AVX512-NEXT: addl %ecx, %edx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq 88(%rdi), %rax -; AVX512-NEXT: popcntq 80(%rdi), %rsi -; AVX512-NEXT: popcntq 72(%rdi), %r8 -; AVX512-NEXT: addl %eax, %esi -; AVX512-NEXT: xorl %ecx, %ecx -; AVX512-NEXT: popcntq 64(%rdi), %rcx -; AVX512-NEXT: addl %r8d, %ecx -; AVX512-NEXT: addl %esi, %ecx -; AVX512-NEXT: addl %edx, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq 56(%rdi), %rax -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: popcntq 48(%rdi), %rdx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: popcntq 40(%rdi), %rsi -; AVX512-NEXT: addl %eax, %edx -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: popcntq 32(%rdi), %r8 -; AVX512-NEXT: addl %esi, %r8d -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq 24(%rdi), %rax -; AVX512-NEXT: addl %edx, %r8d -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: popcntq 16(%rdi), %rdx -; AVX512-NEXT: addl %eax, %edx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: popcntq 8(%rdi), %rsi -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: popcntq (%rdi), %rax -; AVX512-NEXT: addl %esi, %eax -; AVX512-NEXT: addl %edx, %eax -; AVX512-NEXT: addl %r8d, %eax -; AVX512-NEXT: addl %ecx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: retq +; AVX512F-LABEL: load_ctpop_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: popcntq 120(%rdi), %rax +; AVX512F-NEXT: popcntq 112(%rdi), %rcx +; AVX512F-NEXT: addl %eax, %ecx +; AVX512F-NEXT: popcntq 104(%rdi), %rax +; AVX512F-NEXT: popcntq 96(%rdi), %rdx +; AVX512F-NEXT: addl %eax, %edx +; AVX512F-NEXT: addl %ecx, %edx +; AVX512F-NEXT: popcntq 88(%rdi), %rax +; AVX512F-NEXT: popcntq 80(%rdi), %rsi +; AVX512F-NEXT: popcntq 72(%rdi), %r8 +; AVX512F-NEXT: popcntq 64(%rdi), %rcx +; AVX512F-NEXT: addl %eax, %esi +; AVX512F-NEXT: addl %r8d, %ecx +; AVX512F-NEXT: addl %esi, %ecx +; AVX512F-NEXT: addl %edx, %ecx +; AVX512F-NEXT: popcntq 56(%rdi), %rax +; AVX512F-NEXT: popcntq 48(%rdi), %rdx +; AVX512F-NEXT: popcntq 40(%rdi), %rsi +; AVX512F-NEXT: popcntq 32(%rdi), %r8 +; AVX512F-NEXT: addl %eax, %edx +; AVX512F-NEXT: addl %esi, %r8d +; AVX512F-NEXT: popcntq 24(%rdi), %rax +; AVX512F-NEXT: addl 
%edx, %r8d +; AVX512F-NEXT: popcntq 16(%rdi), %rdx +; AVX512F-NEXT: addl %eax, %edx +; AVX512F-NEXT: popcntq 8(%rdi), %rsi +; AVX512F-NEXT: popcntq (%rdi), %rax +; AVX512F-NEXT: addl %esi, %eax +; AVX512F-NEXT: addl %edx, %eax +; AVX512F-NEXT: addl %r8d, %eax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_ctpop_i1024: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: popcntq 120(%rdi), %rax +; AVX512VL-NEXT: popcntq 112(%rdi), %rcx +; AVX512VL-NEXT: addl %eax, %ecx +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq 104(%rdi), %rax +; AVX512VL-NEXT: popcntq 96(%rdi), %rdx +; AVX512VL-NEXT: addl %eax, %edx +; AVX512VL-NEXT: addl %ecx, %edx +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq 88(%rdi), %rax +; AVX512VL-NEXT: popcntq 80(%rdi), %rsi +; AVX512VL-NEXT: popcntq 72(%rdi), %r8 +; AVX512VL-NEXT: addl %eax, %esi +; AVX512VL-NEXT: xorl %ecx, %ecx +; AVX512VL-NEXT: popcntq 64(%rdi), %rcx +; AVX512VL-NEXT: addl %r8d, %ecx +; AVX512VL-NEXT: addl %esi, %ecx +; AVX512VL-NEXT: addl %edx, %ecx +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq 56(%rdi), %rax +; AVX512VL-NEXT: xorl %edx, %edx +; AVX512VL-NEXT: popcntq 48(%rdi), %rdx +; AVX512VL-NEXT: xorl %esi, %esi +; AVX512VL-NEXT: popcntq 40(%rdi), %rsi +; AVX512VL-NEXT: addl %eax, %edx +; AVX512VL-NEXT: xorl %r8d, %r8d +; AVX512VL-NEXT: popcntq 32(%rdi), %r8 +; AVX512VL-NEXT: addl %esi, %r8d +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq 24(%rdi), %rax +; AVX512VL-NEXT: addl %edx, %r8d +; AVX512VL-NEXT: xorl %edx, %edx +; AVX512VL-NEXT: popcntq 16(%rdi), %rdx +; AVX512VL-NEXT: addl %eax, %edx +; AVX512VL-NEXT: xorl %esi, %esi +; AVX512VL-NEXT: popcntq 8(%rdi), %rsi +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq (%rdi), %rax +; AVX512VL-NEXT: addl %esi, %eax +; AVX512VL-NEXT: addl %edx, %eax +; AVX512VL-NEXT: addl %r8d, %eax +; AVX512VL-NEXT: addl %ecx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_ctpop_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: popcntq 120(%rdi), %rax +; AVX512POPCNT-NEXT: popcntq 112(%rdi), %rcx +; AVX512POPCNT-NEXT: addl %eax, %ecx +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq 104(%rdi), %rax +; AVX512POPCNT-NEXT: popcntq 96(%rdi), %rdx +; AVX512POPCNT-NEXT: addl %eax, %edx +; AVX512POPCNT-NEXT: addl %ecx, %edx +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq 88(%rdi), %rax +; AVX512POPCNT-NEXT: popcntq 80(%rdi), %rsi +; AVX512POPCNT-NEXT: popcntq 72(%rdi), %r8 +; AVX512POPCNT-NEXT: addl %eax, %esi +; AVX512POPCNT-NEXT: xorl %ecx, %ecx +; AVX512POPCNT-NEXT: popcntq 64(%rdi), %rcx +; AVX512POPCNT-NEXT: addl %r8d, %ecx +; AVX512POPCNT-NEXT: addl %esi, %ecx +; AVX512POPCNT-NEXT: addl %edx, %ecx +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq 56(%rdi), %rax +; AVX512POPCNT-NEXT: xorl %edx, %edx +; AVX512POPCNT-NEXT: popcntq 48(%rdi), %rdx +; AVX512POPCNT-NEXT: xorl %esi, %esi +; AVX512POPCNT-NEXT: popcntq 40(%rdi), %rsi +; AVX512POPCNT-NEXT: addl %eax, %edx +; AVX512POPCNT-NEXT: xorl %r8d, %r8d +; AVX512POPCNT-NEXT: popcntq 32(%rdi), %r8 +; AVX512POPCNT-NEXT: addl %esi, %r8d +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq 24(%rdi), %rax +; AVX512POPCNT-NEXT: addl %edx, %r8d +; AVX512POPCNT-NEXT: xorl %edx, %edx +; AVX512POPCNT-NEXT: popcntq 16(%rdi), %rdx +; AVX512POPCNT-NEXT: addl %eax, %edx +; AVX512POPCNT-NEXT: xorl %esi, %esi 
+; AVX512POPCNT-NEXT: popcntq 8(%rdi), %rsi +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq (%rdi), %rax +; AVX512POPCNT-NEXT: addl %esi, %eax +; AVX512POPCNT-NEXT: addl %edx, %eax +; AVX512POPCNT-NEXT: addl %r8d, %eax +; AVX512POPCNT-NEXT: addl %ecx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq %a0 = load i1024, ptr %p0 %cnt = call i1024 @llvm.ctpop.i1024(i1024 %a0) %res = trunc i1024 %cnt to i32 @@ -596,6 +1287,75 @@ define i32 @load_ctlz_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm0, %rdx +; SSE-NEXT: bsrq %rdx, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: movl $127, %eax +; SSE-NEXT: bsrq %rcx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: lzcntq %rcx, %rdx +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_i128: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: lzcntq %rcx, %rdx +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edx, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_i128: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: lzcntq %rcx, %rdx +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_i128: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdx +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 0) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_i256(i256 %a0) nounwind { ; SSE-LABEL: test_ctlz_i256: ; SSE: # %bb.0: @@ -710,32 +1470,177 @@ define i32 @load_ctlz_i256(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_ctlz_i256: -; AVX512: # %bb.0: -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: movq 16(%rdi), %rdx -; AVX512-NEXT: movq 24(%rdi), %rsi -; AVX512-NEXT: lzcntq %rsi, %rax -; AVX512-NEXT: lzcntq %rdx, %r8 -; AVX512-NEXT: addl $64, %r8d -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %eax, %r8d -; AVX512-NEXT: lzcntq %rcx, %r9 -; AVX512-NEXT: lzcntq (%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %r9d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rsi, %rdx 
-; AVX512-NEXT: cmovnel %r8d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: retq +; AVX512F-LABEL: load_ctlz_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = mem[3,2,1,0] +; AVX512F-NEXT: vplzcntq %zmm1, %zmm2 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $12, %k0, %k0 +; AVX512F-NEXT: kshiftrw $12, %k0, %k1 +; AVX512F-NEXT: vpcompressq %zmm2, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_ctlz_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512VL-NEXT: vplzcntq %ymm0, %ymm1 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512VL-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_ctlz_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512POPCNT-NEXT: vplzcntq %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i256, ptr %p0 %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 0) %res = trunc i256 %cnt to i32 ret i32 %res } +define i32 @vector_ctlz_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_i256: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm0, %rdx +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: bsrq %rsi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %rax, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %r8d +; SSE-NEXT: bsrq %rdx, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: movl $127, %eax +; SSE-NEXT: bsrq %rcx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: lzcntq %rsi, %rdi +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %rcx, %rdi +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, 
%rdx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: lzcntq %rsi, %rdi +; AVX512F-NEXT: lzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: lzcntq %rcx, %rdi +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rsi, %rdx +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rdx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512VL-NEXT: lzcntq %rsi, %rdi +; AVX512VL-NEXT: lzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: lzcntq %rcx, %rdi +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rsi, %rdx +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: lzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rsi, %rdx +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_i512(i512 %a0) nounwind { ; SSE-LABEL: test_ctlz_i512: ; SSE: # %bb.0: @@ -843,50 +1748,76 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind { ; AVX2-NEXT: popq %r15 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_ctlz_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: lzcntq %r11, %rax -; AVX512-NEXT: lzcntq %r10, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %r9, %rax -; AVX512-NEXT: lzcntq %r8, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %r10, %rax -; AVX512-NEXT: orq %r11, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: lzcntq %rcx, %rax -; AVX512-NEXT: lzcntq %rdx, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: 
lzcntq %rsi, %r15 -; AVX512-NEXT: lzcntq %rdi, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %r15d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rcx, %rdx -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r11, %r9 -; AVX512-NEXT: orq %r10, %r8 -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_ctlz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rdi, %xmm0 +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vmovq %rcx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512F-NEXT: vmovq %r8, %xmm2 +; AVX512F-NEXT: vmovq %r9, %xmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_ctlz_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %rdi, %xmm0 +; AVX512VL-NEXT: vmovq %rsi, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rdx, %xmm1 +; AVX512VL-NEXT: vmovq %rcx, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512VL-NEXT: vmovq %r8, %xmm2 +; AVX512VL-NEXT: vmovq %r9, %xmm3 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: test_ctlz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512POPCNT-NEXT: vmovq %r8, %xmm2 +; AVX512POPCNT-NEXT: vmovq %r9, %xmm3 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: 
vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) %res = trunc i512 %cnt to i32 ret i32 %res @@ -1008,59 +1939,194 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; AVX2-NEXT: popq %r15 ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_ctlz_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 8(%rdi), %r11 -; AVX512-NEXT: movq 16(%rdi), %r9 -; AVX512-NEXT: movq 24(%rdi), %r10 -; AVX512-NEXT: movq 32(%rdi), %rcx -; AVX512-NEXT: movq 40(%rdi), %rdx -; AVX512-NEXT: movq 48(%rdi), %rsi -; AVX512-NEXT: movq 56(%rdi), %r8 -; AVX512-NEXT: lzcntq %r8, %rax -; AVX512-NEXT: lzcntq %rsi, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq %rdx, %rax -; AVX512-NEXT: lzcntq %rcx, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %rsi, %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: lzcntq %r10, %rax -; AVX512-NEXT: lzcntq %r9, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: lzcntq (%rdi), %rax -; AVX512-NEXT: lzcntq %r11, %rdi -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r10, %r9 -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r8, %rdx -; AVX512-NEXT: orq %rsi, %rcx -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_ctlz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_ctlz_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_ctlz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; 
AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i512, ptr %p0 %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) %res = trunc i512 %cnt to i32 ret i32 %res } +define i32 @vector_ctlz_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_i512: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rdx +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: pextrq $1, %xmm2, %rdi +; SSE-NEXT: movq %xmm2, %rsi +; SSE-NEXT: movq %xmm3, %r8 +; SSE-NEXT: pextrq $1, %xmm3, %r9 +; SSE-NEXT: bsrq %r9, %r10 +; SSE-NEXT: xorl $63, %r10d +; SSE-NEXT: bsrq %r8, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %r10d, %r8d +; SSE-NEXT: bsrq %rdi, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq %rsi, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: orl $64, %esi +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %r9d, %esi +; SSE-NEXT: movq %xmm1, %rdi +; SSE-NEXT: subl $-128, %esi +; SSE-NEXT: ptest %xmm3, %xmm3 +; SSE-NEXT: cmovnel %r8d, %esi +; SSE-NEXT: bsrq %rax, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: bsrq %rdi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: orl $64, %edi +; SSE-NEXT: testq %rax, %rax +; SSE-NEXT: cmovnel %r8d, %edi +; SSE-NEXT: bsrq %rcx, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: movl $127, %eax +; SSE-NEXT: bsrq %rdx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rcx +; AVX2-NEXT: vmovq %xmm2, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vmovq %xmm2, %r8 +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: vmovq %xmm1, %rdi +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: lzcntq %rax, %r10 +; AVX2-NEXT: lzcntq %r8, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: lzcntq %r9, %r10 +; AVX2-NEXT: lzcntq %rdi, %rdi +; AVX2-NEXT: addl $64, %edi +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %edi +; AVX2-NEXT: subl $-128, %edi +; AVX2-NEXT: orq %rax, %r8 +; AVX2-NEXT: cmovnel %r11d, %edi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: lzcntq %rsi, %r9 +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm1, %ymm1 +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; 
AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_ctlz_i1024: ; SSE: # %bb.0: @@ -1312,116 +2378,151 @@ define i32 @test_ctlz_i1024(i1024 %a0) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_ctlz_i1024: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq %r9, %r14 -; AVX512-NEXT: movq %r8, %r11 -; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512-NEXT: lzcntq %r12, %rcx -; AVX512-NEXT: lzcntq %r8, %r9 -; AVX512-NEXT: addl $64, %r9d -; AVX512-NEXT: testq %r12, %r12 -; AVX512-NEXT: cmovnel %ecx, %r9d -; AVX512-NEXT: lzcntq %r10, %rsi -; AVX512-NEXT: lzcntq %rax, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %esi, %ecx -; AVX512-NEXT: subl $-128, %ecx -; AVX512-NEXT: movq %r8, %rsi -; AVX512-NEXT: orq %r12, %rsi -; AVX512-NEXT: cmovnel %r9d, %ecx -; AVX512-NEXT: lzcntq %rbx, %rdi -; AVX512-NEXT: lzcntq %r15, %rsi -; AVX512-NEXT: addl $64, %esi -; AVX512-NEXT: testq %rbx, %rbx -; AVX512-NEXT: cmovnel %edi, %esi -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; AVX512-NEXT: lzcntq %r13, %rbp -; AVX512-NEXT: addl $64, %ebp -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; AVX512-NEXT: lzcntq %r9, %rdi -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: cmovnel %edi, %ebp -; AVX512-NEXT: subl $-128, %ebp -; AVX512-NEXT: movq %r15, %rdi -; AVX512-NEXT: orq %rbx, %rdi -; AVX512-NEXT: cmovnel %esi, %ebp -; AVX512-NEXT: addl 
$256, %ebp # imm = 0x100 -; AVX512-NEXT: movq %r10, %rdi -; AVX512-NEXT: orq %r12, %rdi -; AVX512-NEXT: movq %rax, %rsi -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rdi, %rsi -; AVX512-NEXT: cmovnel %ecx, %ebp -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: lzcntq %rdi, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512-NEXT: lzcntq %r12, %rcx -; AVX512-NEXT: testq %r12, %r12 -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: lzcntq %r11, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: lzcntq %r14, %rsi -; AVX512-NEXT: testq %r14, %r14 -; AVX512-NEXT: cmovnel %esi, %ecx -; AVX512-NEXT: subl $-128, %ecx -; AVX512-NEXT: movq %rdi, %rsi -; AVX512-NEXT: orq %r12, %rsi -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: movq %rdx, %rdi -; AVX512-NEXT: lzcntq %rdx, %rdx -; AVX512-NEXT: addl $64, %edx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: lzcntq %r10, %rax -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %eax, %edx -; AVX512-NEXT: lzcntq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT: lzcntq %rsi, %r8 -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %r8d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r10, %rdi -; AVX512-NEXT: cmovnel %edx, %eax -; AVX512-NEXT: orq %r12, %r14 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r14, %r11 -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rbx -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; AVX512-NEXT: orq %rbx, %r9 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r15 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r13 -; AVX512-NEXT: orq %r15, %r13 -; AVX512-NEXT: addl $512, %eax # imm = 0x200 -; AVX512-NEXT: orq %r9, %r13 -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: retq +; AVX512F-LABEL: test_ctlz_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512F-NEXT: vmovq %rdi, %xmm0 +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vmovq %rcx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovq %r8, %xmm1 +; AVX512F-NEXT: vmovq %r9, %xmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448] +; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl $512, %ecx # imm = 0x200 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; 
AVX512F-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: orq %r14, %r11 +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: orq %rbx, %r10 +; AVX512F-NEXT: orq %r11, %r10 +; AVX512F-NEXT: cmovel %ecx, %eax +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_ctlz_i1024: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: pushq %r14 +; AVX512VL-NEXT: pushq %rbx +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512VL-NEXT: vmovq %rdi, %xmm0 +; AVX512VL-NEXT: vmovq %rsi, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rdx, %xmm1 +; AVX512VL-NEXT: vmovq %rcx, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512VL-NEXT: vmovq %r8, %xmm2 +; AVX512VL-NEXT: vmovq %r9, %xmm3 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448] +; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %ecx +; AVX512VL-NEXT: addl $512, %ecx # imm = 0x200 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512VL-NEXT: orq %r14, %r11 +; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512VL-NEXT: orq %rbx, %r10 +; AVX512VL-NEXT: orq %r11, %r10 +; AVX512VL-NEXT: cmovel %ecx, %eax +; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: popq %r14 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: test_ctlz_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: pushq %r14 +; AVX512POPCNT-NEXT: pushq %rbx +; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512POPCNT-NEXT: vmovq %r8, %xmm2 +; AVX512POPCNT-NEXT: 
vmovq %r9, %xmm3 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448] +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %ecx +; AVX512POPCNT-NEXT: addl $512, %ecx # imm = 0x200 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512POPCNT-NEXT: orq %r14, %r11 +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512POPCNT-NEXT: orq %rbx, %r10 +; AVX512POPCNT-NEXT: orq %r11, %r10 +; AVX512POPCNT-NEXT: cmovel %ecx, %eax +; AVX512POPCNT-NEXT: popq %rbx +; AVX512POPCNT-NEXT: popq %r14 +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 0) %res = trunc i1024 %cnt to i32 ret i32 %res @@ -1687,121 +2788,1768 @@ define i32 @load_ctlz_i1024(ptr %p0) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_ctlz_i1024: +; AVX512F-LABEL: load_ctlz_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq 80(%rdi), %rsi +; AVX512F-NEXT: movq 64(%rdi), %rcx +; AVX512F-NEXT: movq 72(%rdi), %rdx +; AVX512F-NEXT: movq 88(%rdi), %r8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq 64(%rdi), %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448] +; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm1, %r9d +; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512F-NEXT: vpaddq %zmm3, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: orq 120(%rdi), %r8 +; AVX512F-NEXT: addl $512, %eax # imm = 0x200 +; AVX512F-NEXT: orq 104(%rdi), %rdx +; AVX512F-NEXT: orq %r8, %rdx +; AVX512F-NEXT: orq 112(%rdi), %rsi +; AVX512F-NEXT: orq 96(%rdi), %rcx +; AVX512F-NEXT: orq %rsi, %rcx +; AVX512F-NEXT: orq %rdx, %rcx +; AVX512F-NEXT: cmovnel %r9d, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_ctlz_i1024: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movq 80(%rdi), %rsi +; AVX512VL-NEXT: movq 64(%rdi), %rcx +; AVX512VL-NEXT: movq 72(%rdi), %rdx +; AVX512VL-NEXT: movq 88(%rdi), %r8 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq 64(%rdi), %zmm0, %zmm1 +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm2 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448] +; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; 
AVX512VL-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm1, %r9d +; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VL-NEXT: vpaddq %zmm3, %zmm1, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: addl $512, %eax # imm = 0x200 +; AVX512VL-NEXT: orq 120(%rdi), %r8 +; AVX512VL-NEXT: orq 104(%rdi), %rdx +; AVX512VL-NEXT: orq 112(%rdi), %rsi +; AVX512VL-NEXT: orq %r8, %rdx +; AVX512VL-NEXT: orq 96(%rdi), %rcx +; AVX512VL-NEXT: orq %rsi, %rcx +; AVX512VL-NEXT: orq %rdx, %rcx +; AVX512VL-NEXT: cmovnel %r9d, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_ctlz_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: movq 80(%rdi), %rsi +; AVX512POPCNT-NEXT: movq 64(%rdi), %rcx +; AVX512POPCNT-NEXT: movq 72(%rdi), %rdx +; AVX512POPCNT-NEXT: movq 88(%rdi), %r8 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq 64(%rdi), %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vplzcntq %zmm1, %zmm2 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448] +; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm1, %r9d +; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200 +; AVX512POPCNT-NEXT: orq 120(%rdi), %r8 +; AVX512POPCNT-NEXT: orq 104(%rdi), %rdx +; AVX512POPCNT-NEXT: orq 112(%rdi), %rsi +; AVX512POPCNT-NEXT: orq %r8, %rdx +; AVX512POPCNT-NEXT: orq 96(%rdi), %rcx +; AVX512POPCNT-NEXT: orq %rsi, %rcx +; AVX512POPCNT-NEXT: orq %rdx, %rcx +; AVX512POPCNT-NEXT: cmovnel %r9d, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i1024, ptr %p0 + %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 0) + %res = trunc i1024 %cnt to i32 + ret i32 %res +} + +; +; CTLZ_ZERO_UNDEF +; + +define i32 @test_ctlz_undef_i128(i128 %a0) nounwind { +; SSE-LABEL: test_ctlz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: bsrq %rsi, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: bsrq %rdi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: test_ctlz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: lzcntq %rsi, %rcx +; AVX2-NEXT: lzcntq %rdi, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctlz_undef_i128: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 32(%rdi), %r14 -; AVX512-NEXT: movq 48(%rdi), %rbp -; AVX512-NEXT: movq 64(%rdi), %r11 -; AVX512-NEXT: movq 72(%rdi), %r10 -; AVX512-NEXT: movq 80(%rdi), %rdx -; AVX512-NEXT: movq 88(%rdi), 
%rbx -; AVX512-NEXT: movq 96(%rdi), %rsi -; AVX512-NEXT: movq 104(%rdi), %r9 -; AVX512-NEXT: movq 112(%rdi), %r8 -; AVX512-NEXT: movq 120(%rdi), %r15 -; AVX512-NEXT: lzcntq %r15, %rax -; AVX512-NEXT: lzcntq %r8, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: testq %r15, %r15 -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: lzcntq %r9, %r12 -; AVX512-NEXT: lzcntq %rsi, %rax -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: movq %r8, %r12 -; AVX512-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: orq %r15, %r12 -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: lzcntq %rbx, %rcx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: lzcntq %rdx, %r13 -; AVX512-NEXT: addl $64, %r13d -; AVX512-NEXT: testq %rbx, %rbx -; AVX512-NEXT: cmovnel %ecx, %r13d -; AVX512-NEXT: lzcntq %r10, %rcx -; AVX512-NEXT: lzcntq %r11, %r12 -; AVX512-NEXT: addl $64, %r12d -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %ecx, %r12d -; AVX512-NEXT: subl $-128, %r12d -; AVX512-NEXT: movq %rdx, %rcx -; AVX512-NEXT: orq %rbx, %rcx -; AVX512-NEXT: cmovnel %r13d, %r12d -; AVX512-NEXT: addl $256, %r12d # imm = 0x100 -; AVX512-NEXT: movq %r9, %rcx -; AVX512-NEXT: orq %r15, %rcx -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: movq 56(%rdi), %r13 -; AVX512-NEXT: cmovnel %eax, %r12d -; AVX512-NEXT: lzcntq %r13, %rcx -; AVX512-NEXT: movq %rbp, %rsi -; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: lzcntq %rbp, %rax +; AVX512-NEXT: lzcntq %rsi, %rcx +; AVX512-NEXT: lzcntq %rdi, %rax ; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r13, %r13 +; AVX512-NEXT: testq %rsi, %rsi ; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: lzcntq %r14, %rbp -; AVX512-NEXT: addl $64, %ebp -; AVX512-NEXT: movq 40(%rdi), %r8 -; AVX512-NEXT: lzcntq %r8, %rdx -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %edx, %ebp -; AVX512-NEXT: subl $-128, %ebp -; AVX512-NEXT: movq %rsi, %rdx -; AVX512-NEXT: orq %r13, %rdx -; AVX512-NEXT: cmovnel %eax, %ebp -; AVX512-NEXT: movq 16(%rdi), %r9 -; AVX512-NEXT: lzcntq %r9, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: movq 24(%rdi), %rdx -; AVX512-NEXT: lzcntq %rdx, %rax -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: movq 8(%rdi), %rsi +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + +define i32 @load_ctlz_undef_i128(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq 8(%rdi), %rcx +; SSE-NEXT: bsrq %rcx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: bsrq (%rdi), %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: load_ctlz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: lzcntq %rcx, %rdx +; AVX2-NEXT: lzcntq (%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_ctlz_undef_i128: +; AVX512: # %bb.0: +; 
AVX512-NEXT: movq 8(%rdi), %rcx +; AVX512-NEXT: lzcntq %rcx, %rdx ; AVX512-NEXT: lzcntq (%rdi), %rax ; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: lzcntq %rsi, %rdi +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = load i128, ptr %p0 + %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + +define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: bsrq %rcx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: bsrq %rax, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: lzcntq %rcx, %rdx +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_undef_i128: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: lzcntq %rcx, %rdx +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edx, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_undef_i128: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: lzcntq %rcx, %rdx +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_undef_i128: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdx +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + +define i32 @test_ctlz_undef_i256(i256 %a0) nounwind { +; SSE-LABEL: test_ctlz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: bsrq %rcx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rdx, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %eax, %r8d +; SSE-NEXT: bsrq %rsi, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq %rdi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: test_ctlz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: 
lzcntq %rsi, %r9 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdi, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_ctlz_undef_i256: +; AVX512: # %bb.0: +; AVX512-NEXT: lzcntq %rcx, %rax +; AVX512-NEXT: lzcntq %rdx, %r8 +; AVX512-NEXT: addl $64, %r8d +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %eax, %r8d +; AVX512-NEXT: lzcntq %rsi, %r9 +; AVX512-NEXT: lzcntq %rdi, %rax +; AVX512-NEXT: addl $64, %eax ; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %edi, %eax +; AVX512-NEXT: cmovnel %r9d, %eax ; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rdx, %r9 -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: orq %r13, %r8 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r8, %r14 -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: orq %r15, %rbx -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; AVX512-NEXT: orq %rbx, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; AVX512-NEXT: orq %rcx, %r11 -; AVX512-NEXT: addl $512, %eax # imm = 0x200 -; AVX512-NEXT: orq %r10, %r11 -; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: orq %rcx, %rdx +; AVX512-NEXT: cmovnel %r8d, %eax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: movq 8(%rdi), %rdx +; SSE-NEXT: movq 16(%rdi), %rcx +; SSE-NEXT: movq 24(%rdi), %rsi +; SSE-NEXT: bsrq %rsi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rcx, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %eax, %r8d +; SSE-NEXT: bsrq %rdx, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq (%rdi), %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rsi, %rcx +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: load_ctlz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: movq 16(%rdi), %rcx +; AVX2-NEXT: movq 24(%rdi), %rdx +; AVX2-NEXT: lzcntq %rdx, %rax +; AVX2-NEXT: lzcntq %rcx, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %esi +; AVX2-NEXT: movq 8(%rdi), %r8 +; AVX2-NEXT: lzcntq %r8, %r9 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq (%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_ctlz_undef_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512F-NEXT: 
vplzcntq %zmm0, %zmm1 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $12, %k0, %k0 +; AVX512F-NEXT: kshiftrw $12, %k0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_ctlz_undef_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vplzcntq %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpcompressq %ymm0, %ymm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_ctlz_undef_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512POPCNT-NEXT: vplzcntq %ymm0, %ymm0 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512POPCNT-NEXT: vpcompressq %ymm0, %ymm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i256, ptr %p0 + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: bsrq %rsi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %rdx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: bsrq %rcx, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: bsrq %rax, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: lzcntq %rsi, %rdi +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %rcx, %rdi +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_undef_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: lzcntq %rsi, %rdi +; AVX512F-NEXT: lzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: lzcntq %rcx, %rdi +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: 
subl $-128, %eax +; AVX512F-NEXT: orq %rsi, %rdx +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_undef_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rdx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512VL-NEXT: lzcntq %rsi, %rdi +; AVX512VL-NEXT: lzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: lzcntq %rcx, %rdi +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rsi, %rdx +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_undef_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: lzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rsi, %rdx +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { +; SSE-LABEL: test_ctlz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: bsrq %r11, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r10, %r14 +; SSE-NEXT: xorl $63, %r14d +; SSE-NEXT: orl $64, %r14d +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: bsrq %r9, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r8, %rbx +; SSE-NEXT: xorl $63, %ebx +; SSE-NEXT: orl $64, %ebx +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %eax, %ebx +; SSE-NEXT: subl $-128, %ebx +; SSE-NEXT: movq %r10, %rax +; SSE-NEXT: orq %r11, %rax +; SSE-NEXT: cmovnel %r14d, %ebx +; SSE-NEXT: bsrq %rcx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rdx, %r14 +; SSE-NEXT: xorl $63, %r14d +; SSE-NEXT: orl $64, %r14d +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: bsrq %rsi, %r15 +; SSE-NEXT: xorl $63, %r15d +; SSE-NEXT: bsrq %rdi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r11, %r9 +; SSE-NEXT: orq %r10, %r8 +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: cmovnel %ebx, %eax +; SSE-NEXT: # kill: def $eax 
killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: test_ctlz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: lzcntq %r11, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: lzcntq %r10, %r14 +; AVX2-NEXT: addl $64, %r14d +; AVX2-NEXT: testq %r11, %r11 +; AVX2-NEXT: cmovnel %eax, %r14d +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %r9, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: lzcntq %r8, %rbx +; AVX2-NEXT: addl $64, %ebx +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %eax, %ebx +; AVX2-NEXT: subl $-128, %ebx +; AVX2-NEXT: movq %r10, %rax +; AVX2-NEXT: orq %r11, %rax +; AVX2-NEXT: cmovnel %r14d, %ebx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: lzcntq %rdx, %r14 +; AVX2-NEXT: addl $64, %r14d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %r14d +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: lzcntq %rsi, %r15 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdi, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r14d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r11, %r9 +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: orq %r9, %r8 +; AVX2-NEXT: cmovnel %ebx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_ctlz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rdi, %xmm0 +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vmovq %rcx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovq %r8, %xmm1 +; AVX512F-NEXT: vmovq %r9, %xmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_ctlz_undef_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %rdi, %xmm0 +; AVX512VL-NEXT: vmovq %rsi, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rdx, %xmm1 +; AVX512VL-NEXT: vmovq %rcx, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512VL-NEXT: vmovq %r8, %xmm2 +; AVX512VL-NEXT: vmovq %r9, %xmm3 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax 
+; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: test_ctlz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512POPCNT-NEXT: vmovq %r8, %xmm2 +; AVX512POPCNT-NEXT: vmovq %r9, %xmm3 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq 8(%rdi), %r11 +; SSE-NEXT: movq 16(%rdi), %r9 +; SSE-NEXT: movq 24(%rdi), %r10 +; SSE-NEXT: movq 32(%rdi), %rcx +; SSE-NEXT: movq 40(%rdi), %rdx +; SSE-NEXT: movq 48(%rdi), %rsi +; SSE-NEXT: movq 56(%rdi), %r8 +; SSE-NEXT: bsrq %r8, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rsi, %r14 +; SSE-NEXT: xorl $63, %r14d +; SSE-NEXT: orl $64, %r14d +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: bsrq %rdx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rcx, %rbx +; SSE-NEXT: xorl $63, %ebx +; SSE-NEXT: orl $64, %ebx +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %ebx +; SSE-NEXT: subl $-128, %ebx +; SSE-NEXT: movq %rsi, %rax +; SSE-NEXT: orq %r8, %rax +; SSE-NEXT: cmovnel %r14d, %ebx +; SSE-NEXT: bsrq %r10, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r9, %r14 +; SSE-NEXT: xorl $63, %r14d +; SSE-NEXT: orl $64, %r14d +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %eax, %r14d +; SSE-NEXT: bsrq %r11, %r15 +; SSE-NEXT: xorl $63, %r15d +; SSE-NEXT: bsrq (%rdi), %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r10, %r9 +; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r8, %rdx +; SSE-NEXT: orq %rsi, %rcx +; SSE-NEXT: orq %rdx, %rcx +; SSE-NEXT: cmovnel %ebx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: load_ctlz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 8(%rdi), %r10 +; AVX2-NEXT: movq 16(%rdi), %r9 +; AVX2-NEXT: movq 32(%rdi), %rcx +; AVX2-NEXT: movq 40(%rdi), %rdx +; AVX2-NEXT: movq 48(%rdi), %rsi +; AVX2-NEXT: movq 56(%rdi), %r8 +; AVX2-NEXT: lzcntq %r8, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: lzcntq %rsi, %rbx +; AVX2-NEXT: addl $64, %ebx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %eax, %ebx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdx, %rax +; AVX2-NEXT: 
lzcntq %rcx, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %r11d +; AVX2-NEXT: subl $-128, %r11d +; AVX2-NEXT: movq %rsi, %rax +; AVX2-NEXT: orq %r8, %rax +; AVX2-NEXT: cmovnel %ebx, %r11d +; AVX2-NEXT: movq 24(%rdi), %rbx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rbx, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: lzcntq %r9, %r14 +; AVX2-NEXT: addl $64, %r14d +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %eax, %r14d +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: lzcntq %r10, %r15 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq (%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rbx, %r9 +; AVX2-NEXT: cmovnel %r14d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r8, %rdx +; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: cmovnel %r11d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_ctlz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_ctlz_undef_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_ctlz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i512, ptr %p0 + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: pextrq $1, %xmm2, %rsi +; SSE-NEXT: movq %xmm2, %rdx +; SSE-NEXT: movq %xmm3, %rdi +; SSE-NEXT: pextrq $1, %xmm3, %r8 +; SSE-NEXT: bsrq %r8, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq %rdi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: orl $64, %edi +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %r9d, %edi +; SSE-NEXT: bsrq %rsi, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: bsrq %rdx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %r8d, %edx +; SSE-NEXT: movq %xmm0, %rsi +; SSE-NEXT: subl $-128, %edx +; SSE-NEXT: ptest %xmm3, %xmm3 +; SSE-NEXT: movq %xmm1, %r8 +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: bsrq 
%rax, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %r8, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %rax, %rax +; SSE-NEXT: cmovnel %edi, %r8d +; SSE-NEXT: bsrq %rcx, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %rsi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rcx +; AVX2-NEXT: vmovq %xmm2, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vmovq %xmm2, %r8 +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: vmovq %xmm1, %rdi +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: lzcntq %rax, %r10 +; AVX2-NEXT: lzcntq %r8, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: lzcntq %r9, %r10 +; AVX2-NEXT: lzcntq %rdi, %rdi +; AVX2-NEXT: addl $64, %edi +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %edi +; AVX2-NEXT: subl $-128, %edi +; AVX2-NEXT: orq %rax, %r8 +; AVX2-NEXT: cmovnel %r11d, %edi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: lzcntq %rsi, %r9 +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm1, %ymm1 +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_undef_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpcompressq 
%zmm0, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { +; SSE-LABEL: test_ctlz_undef_i1024: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %r9, %r12 +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: bsrq %r11, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %rsi, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: orl $64, %ecx +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: bsrq %rdx, %r10 +; SSE-NEXT: xorl $63, %r10d +; SSE-NEXT: bsrq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %r10d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %rsi, %r9 +; SSE-NEXT: movq %rsi, %rbx +; SSE-NEXT: orq %r11, %r9 +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: bsrq %r15, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: bsrq %r13, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: orl $64, %esi +; SSE-NEXT: testq %r15, %r15 +; SSE-NEXT: cmovnel %ecx, %esi +; SSE-NEXT: bsrq %r14, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; SSE-NEXT: bsrq %r9, %rbp +; SSE-NEXT: xorl $63, %ebp +; SSE-NEXT: orl $64, %ebp +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: cmovnel %ecx, %ebp +; SSE-NEXT: movq %r8, %r10 +; SSE-NEXT: subl $-128, %ebp +; SSE-NEXT: movq %r13, %rcx +; SSE-NEXT: orq %r15, %rcx +; SSE-NEXT: cmovnel %esi, %ebp +; SSE-NEXT: addl $256, %ebp # imm = 0x100 +; SSE-NEXT: orq %r11, %rdx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: orq %rbx, %rsi +; SSE-NEXT: orq %rdx, %rsi +; SSE-NEXT: cmovnel %eax, %ebp +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: bsrq %rdx, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE-NEXT: bsrq %r8, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: bsrq %r12, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: bsrq %r10, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: orl $64, %ecx +; SSE-NEXT: testq %r12, %r12 +; SSE-NEXT: cmovnel %esi, %ecx +; SSE-NEXT: movq %rdi, %rbx +; SSE-NEXT: subl $-128, %ecx +; SSE-NEXT: movq %r8, %rsi +; SSE-NEXT: orq %rdx, %rsi +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE-NEXT: bsrq %r11, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; SSE-NEXT: bsrq %r8, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %eax, %edx +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; SSE-NEXT: bsrq %rdi, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: bsrq %rbx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdi, %rdi +; 
SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r11, %r8 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r12 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r12, %r10 +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r15 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; SSE-NEXT: orq %r15, %r14 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r9 +; SSE-NEXT: orq %r13, %r9 +; SSE-NEXT: addl $512, %eax # imm = 0x200 +; SSE-NEXT: orq %r14, %r9 +; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: test_ctlz_undef_i1024: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq %r9, %r14 +; AVX2-NEXT: movq %r8, %r11 +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %r12, %rcx +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: lzcntq %r8, %r9 +; AVX2-NEXT: addl $64, %r9d +; AVX2-NEXT: testq %r12, %r12 +; AVX2-NEXT: cmovnel %ecx, %r9d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: lzcntq %r10, %rsi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %rax, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %esi, %ecx +; AVX2-NEXT: subl $-128, %ecx +; AVX2-NEXT: movq %r8, %rsi +; AVX2-NEXT: orq %r12, %rsi +; AVX2-NEXT: cmovnel %r9d, %ecx +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %rbx, %rdi +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: lzcntq %r15, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %edi, %esi +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX2-NEXT: xorl %ebp, %ebp +; AVX2-NEXT: lzcntq %r13, %rbp +; AVX2-NEXT: addl $64, %ebp +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %r9, %rdi +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %edi, %ebp +; AVX2-NEXT: subl $-128, %ebp +; AVX2-NEXT: movq %r15, %rdi +; AVX2-NEXT: orq %rbx, %rdi +; AVX2-NEXT: cmovnel %esi, %ebp +; AVX2-NEXT: addl $256, %ebp # imm = 0x100 +; AVX2-NEXT: movq %r10, %rdi +; AVX2-NEXT: orq %r12, %rdi +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: orq %r8, %rsi +; AVX2-NEXT: orq %rdi, %rsi +; AVX2-NEXT: cmovnel %ecx, %ebp +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdi, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %r12, %rcx +; AVX2-NEXT: testq %r12, %r12 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %r11, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: lzcntq %r14, %rsi +; AVX2-NEXT: testq %r14, %r14 +; AVX2-NEXT: cmovnel %esi, %ecx +; AVX2-NEXT: subl $-128, %ecx +; AVX2-NEXT: movq %rdi, %rsi +; AVX2-NEXT: orq 
%r12, %rsi +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: movq %rdx, %rdi +; AVX2-NEXT: lzcntq %rdx, %rdx +; AVX2-NEXT: addl $64, %edx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %r10, %rax +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %eax, %edx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; AVX2-NEXT: lzcntq %rsi, %r8 +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %r10, %rdi +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: orq %r12, %r14 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r14, %r11 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r9 +; AVX2-NEXT: orq %rbx, %r9 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r15 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; AVX2-NEXT: orq %r15, %r13 +; AVX2-NEXT: addl $512, %eax # imm = 0x200 +; AVX2-NEXT: orq %r9, %r13 +; AVX2-NEXT: cmovnel %ebp, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_ctlz_undef_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512F-NEXT: vmovq %rdi, %xmm0 +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vmovq %rcx, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512F-NEXT: vmovq %r8, %xmm2 +; AVX512F-NEXT: vmovq %r9, %xmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448] +; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %ecx +; AVX512F-NEXT: addl $512, %ecx # imm = 0x200 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: orq %r14, %r11 +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: orq %rbx, %r10 +; AVX512F-NEXT: orq %r11, %r10 +; AVX512F-NEXT: cmovel %ecx, %eax +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_ctlz_undef_i1024: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: pushq %r14 +; AVX512VL-NEXT: pushq %rbx +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; 
AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512VL-NEXT: vmovq %rdi, %xmm0 +; AVX512VL-NEXT: vmovq %rsi, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rdx, %xmm1 +; AVX512VL-NEXT: vmovq %rcx, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512VL-NEXT: vmovq %r8, %xmm2 +; AVX512VL-NEXT: vmovq %r9, %xmm3 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448] +; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %ecx +; AVX512VL-NEXT: addl $512, %ecx # imm = 0x200 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512VL-NEXT: orq %r14, %r11 +; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512VL-NEXT: orq %rbx, %r10 +; AVX512VL-NEXT: orq %r11, %r10 +; AVX512VL-NEXT: cmovel %ecx, %eax +; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: popq %r14 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: test_ctlz_undef_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: pushq %r14 +; AVX512POPCNT-NEXT: pushq %rbx +; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512POPCNT-NEXT: vmovq %r8, %xmm2 +; AVX512POPCNT-NEXT: vmovq %r9, %xmm3 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448] +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %ecx +; AVX512POPCNT-NEXT: addl $512, %ecx # imm = 0x200 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 
{%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512POPCNT-NEXT: orq %r14, %r11 +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512POPCNT-NEXT: orq %rbx, %r10 +; AVX512POPCNT-NEXT: orq %r11, %r10 +; AVX512POPCNT-NEXT: cmovel %ecx, %eax +; AVX512POPCNT-NEXT: popq %rbx +; AVX512POPCNT-NEXT: popq %r14 +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 -1) + %res = trunc i1024 %cnt to i32 + ret i32 %res +} + +define i32 @load_ctlz_undef_i1024(ptr %p0) nounwind { +; SSE-LABEL: load_ctlz_undef_i1024: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq 40(%rdi), %rbp +; SSE-NEXT: movq 64(%rdi), %rbx +; SSE-NEXT: movq 72(%rdi), %r11 +; SSE-NEXT: movq 80(%rdi), %r12 +; SSE-NEXT: movq 88(%rdi), %r14 +; SSE-NEXT: movq 96(%rdi), %r13 +; SSE-NEXT: movq 104(%rdi), %r9 +; SSE-NEXT: movq 112(%rdi), %r10 +; SSE-NEXT: movq 120(%rdi), %r8 +; SSE-NEXT: bsrq %r8, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r10, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: orl $64, %ecx +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: bsrq %r9, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: bsrq %r13, %rax +; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %r10, %rdx +; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: orq %r8, %rdx +; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: bsrq %r14, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: movq %r12, %rsi +; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: bsrq %r12, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: cmovnel %ecx, %edx +; SSE-NEXT: bsrq %r11, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: bsrq %rbx, %r15 +; SSE-NEXT: xorl $63, %r15d +; SSE-NEXT: orl $64, %r15d +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %ecx, %r15d +; SSE-NEXT: movq 48(%rdi), %r12 +; SSE-NEXT: subl $-128, %r15d +; SSE-NEXT: movq %rsi, %rcx +; SSE-NEXT: orq %r14, %rcx +; SSE-NEXT: cmovnel %edx, %r15d +; SSE-NEXT: addl $256, %r15d # imm = 0x100 +; SSE-NEXT: movq %r9, %rcx +; SSE-NEXT: orq %r8, %rcx +; SSE-NEXT: movq %r13, %rdx +; SSE-NEXT: orq %r10, %rdx +; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: movq 56(%rdi), %r13 +; SSE-NEXT: cmovnel %eax, %r15d +; SSE-NEXT: bsrq %r13, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: bsrq %r12, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %r13, %r13 +; SSE-NEXT: cmovnel %eax, %edx +; SSE-NEXT: movq %rbp, %r10 +; SSE-NEXT: bsrq %rbp, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: movq 32(%rdi), %r8 +; SSE-NEXT: bsrq %r8, %rbp +; SSE-NEXT: xorl $63, %ebp +; SSE-NEXT: orl $64, %ebp +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %eax, %ebp +; SSE-NEXT: subl $-128, %ebp +; SSE-NEXT: movq %r12, %rax +; SSE-NEXT: orq %r13, %rax +; SSE-NEXT: cmovnel %edx, %ebp +; SSE-NEXT: movq 24(%rdi), %r9 +; SSE-NEXT: bsrq %r9, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: movq 
16(%rdi), %rsi +; SSE-NEXT: bsrq %rsi, %rcx +; SSE-NEXT: xorl $63, %ecx +; SSE-NEXT: orl $64, %ecx +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: movq 8(%rdi), %rdx +; SSE-NEXT: bsrq (%rdi), %rax +; SSE-NEXT: bsrq %rdx, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %rsi +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: orq %r13, %r10 +; SSE-NEXT: orq %r12, %r8 +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r10, %r8 +; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; SSE-NEXT: orq %r14, %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; SSE-NEXT: orq %rcx, %rbx +; SSE-NEXT: addl $512, %eax # imm = 0x200 +; SSE-NEXT: orq %r11, %rbx +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: load_ctlz_undef_i1024: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 48(%rdi), %r9 +; AVX2-NEXT: movq 56(%rdi), %rbp +; AVX2-NEXT: movq 64(%rdi), %r11 +; AVX2-NEXT: movq 72(%rdi), %r10 +; AVX2-NEXT: movq 80(%rdi), %r14 +; AVX2-NEXT: movq 88(%rdi), %rbx +; AVX2-NEXT: movq 96(%rdi), %rdx +; AVX2-NEXT: movq 104(%rdi), %r8 +; AVX2-NEXT: movq 112(%rdi), %rsi +; AVX2-NEXT: movq 120(%rdi), %r15 +; AVX2-NEXT: lzcntq %r15, %rax +; AVX2-NEXT: lzcntq %rsi, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: testq %r15, %r15 +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: lzcntq %r8, %r12 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdx, %rax +; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: movq %rsi, %r12 +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: orq %r15, %r12 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %rbx, %rcx +; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: lzcntq %r14, %r13 +; AVX2-NEXT: addl $64, %r13d +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %ecx, %r13d +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %r10, %rcx +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: lzcntq %r11, %r12 +; AVX2-NEXT: addl $64, %r12d +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %ecx, %r12d +; AVX2-NEXT: subl $-128, %r12d +; AVX2-NEXT: movq %r14, %rcx +; AVX2-NEXT: orq %rbx, %rcx +; AVX2-NEXT: cmovnel %r13d, %r12d +; AVX2-NEXT: addl $256, %r12d # imm = 0x100 +; AVX2-NEXT: movq %r8, %rcx +; AVX2-NEXT: orq %r15, %rcx +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %eax, %r12d +; AVX2-NEXT: movq %rbp, %r14 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %rbp, %rcx +; AVX2-NEXT: movq %r9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %r9, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rbp, %rbp +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: movq 32(%rdi), %r13 +; AVX2-NEXT: xorl %ebp, %ebp +; AVX2-NEXT: lzcntq %r13, %rbp +; AVX2-NEXT: addl $64, %ebp +; AVX2-NEXT: movq 40(%rdi), %r8 +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: lzcntq %r8, %rdx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %edx, %ebp +; AVX2-NEXT: subl $-128, %ebp +; AVX2-NEXT: movq %r9, %rdx +; AVX2-NEXT: orq %r14, %rdx +; AVX2-NEXT: cmovnel %eax, %ebp +; AVX2-NEXT: movq 16(%rdi), %r9 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: lzcntq %r9, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: movq 24(%rdi), %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rdx, %rax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: movq 8(%rdi), %rsi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq (%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: lzcntq %rsi, %rdi +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %r9 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: orq %r14, %r8 +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r8, %r13 +; AVX2-NEXT: cmovnel %ebp, %eax +; AVX2-NEXT: orq %r15, %rbx +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; AVX2-NEXT: orq %rbx, %r10 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX2-NEXT: orq %rcx, %r11 +; AVX2-NEXT: addl $512, %eax # imm = 0x200 +; AVX2-NEXT: orq %r10, %r11 +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_ctlz_undef_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq 80(%rdi), %rsi +; AVX512F-NEXT: movq 64(%rdi), %rcx +; AVX512F-NEXT: movq 72(%rdi), %rdx +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq 64(%rdi), %zmm0, %zmm1 +; AVX512F-NEXT: movq 88(%rdi), %r8 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448] +; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-NEXT: vmovd %xmm1, %r9d +; AVX512F-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512F-NEXT: vpaddq %zmm3, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: orq 120(%rdi), %r8 +; AVX512F-NEXT: addl $512, %eax # imm = 0x200 +; AVX512F-NEXT: orq 104(%rdi), %rdx +; AVX512F-NEXT: orq %r8, %rdx +; AVX512F-NEXT: orq 112(%rdi), %rsi +; AVX512F-NEXT: orq 96(%rdi), %rcx +; AVX512F-NEXT: orq %rsi, %rcx +; AVX512F-NEXT: orq %rdx, %rcx +; AVX512F-NEXT: cmovnel %r9d, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_ctlz_undef_i1024: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movq 80(%rdi), %rsi +; AVX512VL-NEXT: movq 64(%rdi), %rcx +; AVX512VL-NEXT: movq 72(%rdi), %rdx +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; 
AVX512VL-NEXT: vpermq 64(%rdi), %zmm0, %zmm1 +; AVX512VL-NEXT: movq 88(%rdi), %r8 +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm2 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448] +; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512VL-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm1, %r9d +; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VL-NEXT: vpaddq %zmm3, %zmm1, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: addl $512, %eax # imm = 0x200 +; AVX512VL-NEXT: orq 120(%rdi), %r8 +; AVX512VL-NEXT: orq 104(%rdi), %rdx +; AVX512VL-NEXT: orq 112(%rdi), %rsi +; AVX512VL-NEXT: orq %r8, %rdx +; AVX512VL-NEXT: orq 96(%rdi), %rcx +; AVX512VL-NEXT: orq %rsi, %rcx +; AVX512VL-NEXT: orq %rdx, %rcx +; AVX512VL-NEXT: cmovnel %r9d, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_ctlz_undef_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: movq 80(%rdi), %rsi +; AVX512POPCNT-NEXT: movq 64(%rdi), %rcx +; AVX512POPCNT-NEXT: movq 72(%rdi), %rdx +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq 64(%rdi), %zmm0, %zmm1 +; AVX512POPCNT-NEXT: movq 88(%rdi), %r8 +; AVX512POPCNT-NEXT: vplzcntq %zmm1, %zmm2 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448] +; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm1, %r9d +; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200 +; AVX512POPCNT-NEXT: orq 120(%rdi), %r8 +; AVX512POPCNT-NEXT: orq 104(%rdi), %rdx +; AVX512POPCNT-NEXT: orq 112(%rdi), %rsi +; AVX512POPCNT-NEXT: orq %r8, %rdx +; AVX512POPCNT-NEXT: orq 96(%rdi), %rcx +; AVX512POPCNT-NEXT: orq %rsi, %rcx +; AVX512POPCNT-NEXT: orq %rdx, %rcx +; AVX512POPCNT-NEXT: cmovnel %r9d, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i1024, ptr %p0 - %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 0) + %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 -1) %res = trunc i1024 %cnt to i32 ret i32 %res } @@ -1886,6 +4634,49 @@ define i32 @load_cttz_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_i128: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %xmm0, %rdx +; SSE-NEXT: rep bsfq %rdx, %rsi +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: vector_cttz_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrq 
$1, %xmm0, %rax +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq %rax, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_i256(i256 %a0) nounwind { ; SSE-LABEL: test_cttz_i256: ; SSE: # %bb.0: @@ -1992,32 +4783,184 @@ define i32 @load_cttz_i256(ptr %p0) nounwind { ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i256: -; AVX512: # %bb.0: -; AVX512-NEXT: movq 16(%rdi), %rcx -; AVX512-NEXT: movq (%rdi), %rdx -; AVX512-NEXT: movq 8(%rdi), %rsi -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %rsi, %r8 -; AVX512-NEXT: addl $64, %r8d -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %r8d -; AVX512-NEXT: tzcntq %rcx, %r9 -; AVX512-NEXT: tzcntq 24(%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %r9d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: cmovnel %r8d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: retq +; AVX512F-LABEL: load_cttz_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [256,256,256,256] +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpandn %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vplzcntq %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [64,128,192,256] +; AVX512F-NEXT: vpsubq %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $12, %k0, %k0 +; AVX512F-NEXT: kshiftrw $12, %k0, %k1 +; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1} +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_cttz_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vplzcntq %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256] +; AVX512VL-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512VL-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i256, ptr %p0 %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) %res = trunc i256 %cnt to i32 ret i32 %res } +define i32 @vector_cttz_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm1, %rcx +; 
SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rdx +; SSE-NEXT: rep bsfq %rdx, %rsi +; SSE-NEXT: rep bsfq %rax, %rdi +; SSE-NEXT: addl $64, %edi +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %edi +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: rep bsfq %rdx, %rsi +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: tzcntq %rsi, %rdi +; AVX2-NEXT: tzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rcx, %rdi +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vmovq %xmm1, %rcx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: tzcntq %rsi, %rdi +; AVX512F-NEXT: tzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: tzcntq %rcx, %rdi +; AVX512F-NEXT: tzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rdx, %rsi +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vmovq %xmm1, %rcx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512VL-NEXT: vmovq %xmm0, %rsi +; AVX512VL-NEXT: tzcntq %rsi, %rdi +; AVX512VL-NEXT: tzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: tzcntq %rcx, %rdi +; AVX512VL-NEXT: tzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rdx, %rsi +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax +; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi +; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: tzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: tzcntq 
%rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rdx, %rsi +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_i512(i512 %a0) nounwind { ; SSE-LABEL: test_cttz_i512: ; SSE: # %bb.0: @@ -2109,47 +5052,84 @@ define i32 @test_cttz_i512(i512 %a0) nounwind { ; AVX2-NEXT: popq %r14 ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: tzcntq %rdi, %rax -; AVX512-NEXT: tzcntq %rsi, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdi, %rdi -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %rcx, %r10 -; AVX512-NEXT: addl $64, %r10d -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %r10d -; AVX512-NEXT: subl $-128, %r10d -; AVX512-NEXT: movq %rdi, %rax -; AVX512-NEXT: orq %rsi, %rax -; AVX512-NEXT: cmovnel %ebx, %r10d -; AVX512-NEXT: tzcntq %r8, %rax -; AVX512-NEXT: tzcntq %r9, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: tzcntq %r11, %r14 -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: orq %rdx, %rdi -; AVX512-NEXT: orq %rsi, %rdi -; AVX512-NEXT: cmovnel %r10d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cttz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vmovq %rdi, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovq %r9, %xmm1 +; AVX512F-NEXT: vmovq %r8, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_cttz_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %rcx, %xmm0 +; AVX512VL-NEXT: vmovq %rdx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rsi, %xmm1 +; AVX512VL-NEXT: vmovq %rdi, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; 
AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vmovq %r9, %xmm1 +; AVX512VL-NEXT: vmovq %r8, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: test_cttz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vmovq %r9, %xmm1 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) %res = trunc i512 %cnt to i32 ret i32 %res @@ -2263,59 +5243,199 @@ define i32 @load_cttz_i512(ptr %p0) nounwind { ; AVX2-NEXT: popq %r15 ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 48(%rdi), %r11 -; AVX512-NEXT: movq 40(%rdi), %r9 -; AVX512-NEXT: movq 32(%rdi), %r10 -; AVX512-NEXT: movq 24(%rdi), %r8 -; AVX512-NEXT: movq 16(%rdi), %rdx -; AVX512-NEXT: movq (%rdi), %rcx -; AVX512-NEXT: movq 8(%rdi), %rsi -; AVX512-NEXT: tzcntq %rcx, %rax -; AVX512-NEXT: tzcntq %rsi, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: tzcntq %r8, %rbx -; AVX512-NEXT: addl $64, %ebx -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ebx -; AVX512-NEXT: subl $-128, %ebx -; AVX512-NEXT: movq %rcx, %rax -; AVX512-NEXT: orq %rsi, %rax -; AVX512-NEXT: cmovnel %r14d, %ebx -; AVX512-NEXT: tzcntq %r10, %rax -; AVX512-NEXT: tzcntq %r9, %r14 -; AVX512-NEXT: addl $64, %r14d -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %eax, %r14d -; AVX512-NEXT: tzcntq 56(%rdi), %rax -; AVX512-NEXT: tzcntq %r11, %rdi -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r11, %r11 -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r9, %r10 -; 
AVX512-NEXT: cmovnel %r14d, %eax -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: orq %rsi, %rcx -; AVX512-NEXT: cmovnel %ebx, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_cttz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_cttz_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i512, ptr %p0 %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) %res = trunc i512 %cnt to i32 ret i32 %res } +define i32 @vector_cttz_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_i512: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm3, %rdx +; SSE-NEXT: movq %xmm3, %rcx +; SSE-NEXT: pextrq $1, %xmm2, %rax +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: movq %xmm1, %rdi +; SSE-NEXT: pextrq $1, %xmm0, %r8 +; SSE-NEXT: movq %xmm0, %r9 +; SSE-NEXT: rep bsfq %r9, %r10 +; SSE-NEXT: rep bsfq %r8, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %r10d, %r8d +; SSE-NEXT: rep bsfq %rdi, %r9 +; SSE-NEXT: rep bsfq %rsi, %rsi +; SSE-NEXT: addl $64, %esi +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %r9d, %esi +; SSE-NEXT: movq %xmm2, %rdi +; SSE-NEXT: subl $-128, %esi +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %r8d, %esi +; SSE-NEXT: rep bsfq %rdi, %r8 +; SSE-NEXT: rep bsfq %rax, %r9 +; SSE-NEXT: addl $64, %r9d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %r8d, %r9d +; SSE-NEXT: rep bsfq %rcx, %rdi +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; 
SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: vmovq %xmm0, %r9 +; AVX2-NEXT: tzcntq %r9, %r10 +; AVX2-NEXT: tzcntq %rdi, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: tzcntq %r8, %r10 +; AVX2-NEXT: tzcntq %rsi, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r10d, %esi +; AVX2-NEXT: subl $-128, %esi +; AVX2-NEXT: orq %rdi, %r9 +; AVX2-NEXT: cmovnel %r11d, %esi +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rdx, %rdi +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: tzcntq %rcx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: vmovq %xmm2, %rdi +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: tzcntq %rdi, %r9 +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq 
%zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_cttz_i1024: ; SSE: # %bb.0: @@ -2547,111 +5667,136 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_i1024: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq %r9, %r14 -; AVX512-NEXT: movq %r8, %r15 -; AVX512-NEXT: movq %rcx, %r11 -; AVX512-NEXT: movq %rdx, %r10 -; AVX512-NEXT: movq %rsi, %r9 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: tzcntq %rdi, %rax -; AVX512-NEXT: tzcntq %r9, %r12 -; AVX512-NEXT: addl $64, %r12d -; AVX512-NEXT: testq %rdi, %rdi -; AVX512-NEXT: cmovnel %eax, %r12d -; AVX512-NEXT: tzcntq %rdx, %r13 -; AVX512-NEXT: tzcntq %r11, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %r13d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: movq %rdi, %r13 -; AVX512-NEXT: orq %r9, %r13 -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: tzcntq %r8, %r12 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: tzcntq %r14, %r13 -; AVX512-NEXT: addl $64, %r13d -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %r12d, %r13d -; AVX512-NEXT: tzcntq %rcx, %rbp -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %r12 -; AVX512-NEXT: addl $64, %r12d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %ebp, %r12d -; AVX512-NEXT: subl $-128, %r12d -; AVX512-NEXT: movq %r8, %rbp -; AVX512-NEXT: orq %r14, %rbp -; AVX512-NEXT: cmovnel %r13d, %r12d -; AVX512-NEXT: addl $256, %r12d # imm = 0x100 -; AVX512-NEXT: movq %r9, %r13 -; AVX512-NEXT: orq %r11, %r13 -; AVX512-NEXT: movq %rdi, %rbp -; AVX512-NEXT: orq %rdx, %rbp -; AVX512-NEXT: orq %r13, %rbp -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; AVX512-NEXT: cmovnel %eax, %r12d -; AVX512-NEXT: tzcntq %rbx, %rbp -; AVX512-NEXT: tzcntq %r13, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rbx, %rbx -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rbp -; AVX512-NEXT: addl $64, %ebp -; AVX512-NEXT: tzcntq %rsi, %rcx -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %ecx, %ebp -; AVX512-NEXT: subl $-128, %ebp -; AVX512-NEXT: movq %rbx, %rcx -; AVX512-NEXT: orq %r13, %rcx -; AVX512-NEXT: cmovnel %eax, %ebp -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; AVX512-NEXT: tzcntq %r14, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512-NEXT: tzcntq %r8, %rsi -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %esi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r14, %rdx -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r13 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rbx -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r13, %rbx -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: orq 
{{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT: orq %r11, %r9 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: orq %r15, %rdi -; AVX512-NEXT: orq %r10, %rdi -; AVX512-NEXT: addl $512, %eax # imm = 0x200 -; AVX512-NEXT: orq %r9, %rdi -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cttz_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vmovq %rdi, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovq %r9, %xmm1 +; AVX512F-NEXT: vmovq %r8, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vpandnq %zmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vplzcntq %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %r10d +; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpsubq %zmm0, %zmm4, %zmm0 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: addl $512, %eax # imm = 0x200 +; AVX512F-NEXT: orq %r9, %rsi +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rcx +; AVX512F-NEXT: orq %rsi, %rcx +; AVX512F-NEXT: orq %r8, %rdi +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rdx +; AVX512F-NEXT: orq %rdi, %rdx +; AVX512F-NEXT: orq %rcx, %rdx +; AVX512F-NEXT: cmovnel %r10d, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_cttz_i1024: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %rcx, %xmm0 +; AVX512VL-NEXT: vmovq %rdx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rsi, %xmm1 +; AVX512VL-NEXT: vmovq %rdi, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vmovq %r9, %xmm2 +; AVX512VL-NEXT: vmovq %r8, %xmm3 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2 +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm3 +; AVX512VL-NEXT: vpandnq %zmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vplzcntq %zmm3, %zmm3 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vpsubq %zmm3, %zmm4, %zmm3 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %r10d +; AVX512VL-NEXT: vpaddq 
%zmm2, %zmm1, %zmm0 +; AVX512VL-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VL-NEXT: vpsubq %zmm0, %zmm4, %zmm0 +; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: addl $512, %eax # imm = 0x200 +; AVX512VL-NEXT: orq %r9, %rsi +; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rcx +; AVX512VL-NEXT: orq %rsi, %rcx +; AVX512VL-NEXT: orq %r8, %rdi +; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rdx +; AVX512VL-NEXT: orq %rdi, %rdx +; AVX512VL-NEXT: orq %rcx, %rdx +; AVX512VL-NEXT: cmovnel %r10d, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: test_cttz_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vmovq %r9, %xmm2 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm3 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2 +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm3 +; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm0, %zmm3 +; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448] +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %r10d +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm1, %eax +; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200 +; AVX512POPCNT-NEXT: orq %r9, %rsi +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rcx +; AVX512POPCNT-NEXT: orq %rsi, %rcx +; AVX512POPCNT-NEXT: orq %r8, %rdi +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rdx +; AVX512POPCNT-NEXT: orq %rdi, %rdx +; AVX512POPCNT-NEXT: orq %rcx, %rdx +; AVX512POPCNT-NEXT: cmovnel %r10d, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0) %res = trunc i1024 %cnt to i32 ret i32 %res @@ -2900,122 +6045,1693 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i1024: +; AVX512F-LABEL: load_cttz_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 +; AVX512F-NEXT: movq 16(%rdi), %rax +; AVX512F-NEXT: movq (%rdi), %rcx +; AVX512F-NEXT: movq 8(%rdi), %rdx +; AVX512F-NEXT: movq 24(%rdi), %rsi +; AVX512F-NEXT: orq 56(%rdi), %rsi +; AVX512F-NEXT: orq 40(%rdi), %rdx +; AVX512F-NEXT: orq 48(%rdi), %rax +; AVX512F-NEXT: orq 32(%rdi), %rcx +; AVX512F-NEXT: orq %rsi, %rdx +; AVX512F-NEXT: orq 
%rax, %rcx +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm3 +; AVX512F-NEXT: vpandnq %zmm3, %zmm1, %zmm3 +; AVX512F-NEXT: vplzcntq %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm1, %esi +; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vpsubq %zmm1, %zmm4, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: addl $512, %eax # imm = 0x200 +; AVX512F-NEXT: orq %rdx, %rcx +; AVX512F-NEXT: cmovnel %esi, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_cttz_i1024: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqu64 64(%rdi), %zmm0 +; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm1 +; AVX512VL-NEXT: movq 16(%rdi), %rax +; AVX512VL-NEXT: movq (%rdi), %rcx +; AVX512VL-NEXT: movq 8(%rdi), %rdx +; AVX512VL-NEXT: movq 24(%rdi), %rsi +; AVX512VL-NEXT: orq 56(%rdi), %rsi +; AVX512VL-NEXT: orq 40(%rdi), %rdx +; AVX512VL-NEXT: orq %rsi, %rdx +; AVX512VL-NEXT: orq 48(%rdi), %rax +; AVX512VL-NEXT: orq 32(%rdi), %rcx +; AVX512VL-NEXT: orq %rax, %rcx +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm3 +; AVX512VL-NEXT: vpandnq %zmm3, %zmm1, %zmm3 +; AVX512VL-NEXT: vplzcntq %zmm3, %zmm3 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vpsubq %zmm3, %zmm4, %zmm3 +; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512VL-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm1, %esi +; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vpsubq %zmm1, %zmm4, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: addl $512, %eax # imm = 0x200 +; AVX512VL-NEXT: orq %rdx, %rcx +; AVX512VL-NEXT: cmovnel %esi, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0 +; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm1 +; AVX512POPCNT-NEXT: movq 16(%rdi), %rax +; AVX512POPCNT-NEXT: movq (%rdi), %rcx +; AVX512POPCNT-NEXT: movq 8(%rdi), %rdx +; AVX512POPCNT-NEXT: movq 24(%rdi), %rsi +; AVX512POPCNT-NEXT: orq 56(%rdi), %rsi +; AVX512POPCNT-NEXT: orq 40(%rdi), %rdx +; AVX512POPCNT-NEXT: orq %rsi, %rdx +; AVX512POPCNT-NEXT: orq 48(%rdi), %rax +; AVX512POPCNT-NEXT: orq 32(%rdi), %rcx +; AVX512POPCNT-NEXT: orq %rax, %rcx +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm3 +; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm1, %zmm3 +; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448] +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm1, %esi +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; 
AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200 +; AVX512POPCNT-NEXT: orq %rdx, %rcx +; AVX512POPCNT-NEXT: cmovnel %esi, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i1024, ptr %p0 + %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0) + %res = trunc i1024 %cnt to i32 + ret i32 %res +} + +; +; CTTZ_ZERO_UNDEF +; + +define i32 @test_cttz_undef_i128(i128 %a0) nounwind { +; SSE-LABEL: test_cttz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: rep bsfq %rdi, %rcx +; SSE-NEXT: rep bsfq %rsi, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: tzcntq %rdi, %rcx +; AVX2-NEXT: tzcntq %rsi, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_cttz_undef_i128: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 88(%rdi), %rbp -; AVX512-NEXT: movq 72(%rdi), %r15 -; AVX512-NEXT: movq 56(%rdi), %r9 -; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rdi), %rcx -; AVX512-NEXT: movq 40(%rdi), %r10 -; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 32(%rdi), %rsi -; AVX512-NEXT: movq 24(%rdi), %r14 -; AVX512-NEXT: movq 16(%rdi), %rbx -; AVX512-NEXT: movq (%rdi), %r8 -; AVX512-NEXT: movq 8(%rdi), %r11 -; AVX512-NEXT: tzcntq %r8, %rax -; AVX512-NEXT: tzcntq %r11, %rdx -; AVX512-NEXT: addl $64, %edx -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %eax, %edx -; AVX512-NEXT: tzcntq %rbx, %r12 -; AVX512-NEXT: tzcntq %r14, %rax -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: tzcntq %rdi, %rcx +; AVX512-NEXT: tzcntq %rsi, %rax ; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rbx, %rbx -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: movq %r8, %r12 -; AVX512-NEXT: orq %r11, %r12 -; AVX512-NEXT: cmovnel %edx, %eax -; AVX512-NEXT: tzcntq %rsi, %rdx -; AVX512-NEXT: tzcntq %r10, %r13 -; AVX512-NEXT: addl $64, %r13d -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: cmovnel %edx, %r13d -; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: cmovnel %ecx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_undef_i128(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: rep bsfq %rcx, %rdx +; SSE-NEXT: rep bsfq 8(%rdi), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax 
+; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: tzcntq 8(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: load_cttz_undef_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movq (%rdi), %rcx ; AVX512-NEXT: tzcntq %rcx, %rdx -; AVX512-NEXT: tzcntq %r9, %r12 -; AVX512-NEXT: addl $64, %r12d +; AVX512-NEXT: tzcntq 8(%rdi), %rax +; AVX512-NEXT: addl $64, %eax ; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %edx, %r12d -; AVX512-NEXT: subl $-128, %r12d -; AVX512-NEXT: movq %rsi, %rdx -; AVX512-NEXT: orq %r10, %rdx -; AVX512-NEXT: cmovnel %r13d, %r12d -; AVX512-NEXT: addl $256, %r12d # imm = 0x100 -; AVX512-NEXT: movq %r11, %rdx -; AVX512-NEXT: orq %r14, %rdx -; AVX512-NEXT: movq %r8, %r13 -; AVX512-NEXT: orq %rbx, %r13 -; AVX512-NEXT: orq %rdx, %r13 -; AVX512-NEXT: movq 64(%rdi), %r13 -; AVX512-NEXT: cmovnel %eax, %r12d -; AVX512-NEXT: tzcntq %r13, %rdx -; AVX512-NEXT: tzcntq %r15, %rax +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = load i128, ptr %p0 + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + +define i32 @vector_cttz_undef_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: rep bsfq %rcx, %rdx +; SSE-NEXT: rep bsfq %rax, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: vector_cttz_undef_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq %rax, %rax ; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r13, %r13 +; AVX512-NEXT: testq %rcx, %rcx ; AVX512-NEXT: cmovnel %edx, %eax -; AVX512-NEXT: movq %rbp, %r14 -; AVX512-NEXT: tzcntq %rbp, %rbp -; AVX512-NEXT: addl $64, %ebp -; AVX512-NEXT: movq 80(%rdi), %r10 -; AVX512-NEXT: tzcntq %r10, %rcx -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %ecx, %ebp -; AVX512-NEXT: subl $-128, %ebp -; AVX512-NEXT: movq %r13, %rcx -; AVX512-NEXT: orq %r15, %rcx -; AVX512-NEXT: cmovnel %eax, %ebp -; AVX512-NEXT: movq 104(%rdi), %r9 -; AVX512-NEXT: tzcntq %r9, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: movq 96(%rdi), %rdx -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: movq 112(%rdi), %rsi -; AVX512-NEXT: tzcntq 120(%rdi), %rax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + +define i32 @test_cttz_undef_i256(i256 %a0) nounwind { +; SSE-LABEL: test_cttz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: rep bsfq %rdi, %rax +; SSE-NEXT: 
rep bsfq %rsi, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %eax, %r8d +; SSE-NEXT: rep bsfq %rdx, %r9 +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rsi, %rdi +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: tzcntq %rdi, %rax +; AVX2-NEXT: tzcntq %rsi, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: tzcntq %rdx, %r9 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_cttz_undef_i256: +; AVX512: # %bb.0: +; AVX512-NEXT: tzcntq %rdi, %rax +; AVX512-NEXT: tzcntq %rsi, %r8 +; AVX512-NEXT: addl $64, %r8d +; AVX512-NEXT: testq %rdi, %rdi +; AVX512-NEXT: cmovnel %eax, %r8d +; AVX512-NEXT: tzcntq %rdx, %r9 +; AVX512-NEXT: tzcntq %rcx, %rax ; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: tzcntq %rsi, %rdi -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %edi, %eax +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovnel %r9d, %eax ; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r9, %rdx -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: orq %r14, %r15 -; AVX512-NEXT: orq %r10, %r13 -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r15, %r13 -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; AVX512-NEXT: orq %rcx, %r11 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT: orq %rbx, %r8 -; AVX512-NEXT: addl $512, %eax # imm = 0x200 -; AVX512-NEXT: orq %r11, %r8 -; AVX512-NEXT: cmovnel %r12d, %eax +; AVX512-NEXT: orq %rsi, %rdi +; AVX512-NEXT: cmovnel %r8d, %eax ; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_undef_i256(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: movq 16(%rdi), %rcx +; SSE-NEXT: movq (%rdi), %rdx +; SSE-NEXT: movq 8(%rdi), %rsi +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %rsi, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %r8d +; SSE-NEXT: rep bsfq %rcx, %r9 +; SSE-NEXT: rep bsfq 24(%rdi), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %rsi, %rdx +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: movq 8(%rdi), %rdx +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: tzcntq 
%rdx, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %esi +; AVX2-NEXT: movq 16(%rdi), %r8 +; AVX2-NEXT: tzcntq %r8, %r9 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq 24(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_cttz_undef_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256] +; AVX512F-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $12, %k0, %k0 +; AVX512F-NEXT: kshiftrw $12, %k0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_cttz_undef_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256] +; AVX512VL-NEXT: vplzcntq %ymm1, %ymm1 +; AVX512VL-NEXT: vpsubq %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_undef_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i256, ptr %p0 + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rcx +; SSE-NEXT: pextrq $1, %xmm0, %rdx +; SSE-NEXT: movq %xmm0, %rsi +; SSE-NEXT: rep bsfq %rsi, %rdi +; SSE-NEXT: rep bsfq %rdx, %rdx +; SSE-NEXT: addl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: rep bsfq %rcx, %rsi +; SSE-NEXT: rep bsfq %rax, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: tzcntq %rsi, %rdi +; AVX2-NEXT: tzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi 
+; AVX2-NEXT: tzcntq %rcx, %rdi +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_undef_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vmovq %xmm1, %rcx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: tzcntq %rsi, %rdi +; AVX512F-NEXT: tzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: tzcntq %rcx, %rdi +; AVX512F-NEXT: tzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rdx, %rsi +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_undef_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vmovq %xmm1, %rcx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512VL-NEXT: vmovq %xmm0, %rsi +; AVX512VL-NEXT: tzcntq %rsi, %rdi +; AVX512VL-NEXT: tzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: tzcntq %rcx, %rdi +; AVX512VL-NEXT: tzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rdx, %rsi +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_undef_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax +; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi +; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: tzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: tzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rdx, %rsi +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @test_cttz_undef_i512(i512 %a0) nounwind { +; SSE-LABEL: test_cttz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: rep bsfq %rdi, %rax +; SSE-NEXT: rep bsfq %rsi, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %rcx, %r10 +; SSE-NEXT: addl $64, %r10d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %r10d +; SSE-NEXT: subl $-128, 
%r10d +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: orq %rsi, %rax +; SSE-NEXT: cmovnel %r11d, %r10d +; SSE-NEXT: rep bsfq %r8, %rax +; SSE-NEXT: rep bsfq %r9, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE-NEXT: rep bsfq %rbx, %r14 +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rbx, %rbx +; SSE-NEXT: cmovnel %r14d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: cmovnel %r11d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %rcx, %rsi +; SSE-NEXT: orq %rdx, %rdi +; SSE-NEXT: orq %rsi, %rdi +; SSE-NEXT: cmovnel %r10d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: tzcntq %rdi, %rax +; AVX2-NEXT: tzcntq %rsi, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %eax, %r11d +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: tzcntq %rcx, %r10 +; AVX2-NEXT: addl $64, %r10d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %r10d +; AVX2-NEXT: subl $-128, %r10d +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: orq %rsi, %rax +; AVX2-NEXT: cmovnel %r11d, %r10d +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r8, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: tzcntq %r9, %rbx +; AVX2-NEXT: addl $64, %ebx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %eax, %ebx +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: tzcntq %r11, %r14 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r11, %r11 +; AVX2-NEXT: cmovnel %r14d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %r9, %r8 +; AVX2-NEXT: cmovnel %ebx, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %rcx, %rsi +; AVX2-NEXT: orq %rdx, %rdi +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: cmovnel %r10d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_cttz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vmovq %rdi, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovq %r9, %xmm1 +; AVX512F-NEXT: vmovq %r8, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_cttz_undef_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %rcx, %xmm0 +; AVX512VL-NEXT: vmovq %rdx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; 
AVX512VL-NEXT: vmovq %rsi, %xmm1 +; AVX512VL-NEXT: vmovq %rdi, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vmovq %r9, %xmm1 +; AVX512VL-NEXT: vmovq %r8, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: test_cttz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vmovq %r9, %xmm2 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm3 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2 +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_undef_i512(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq 40(%rdi), %r9 +; SSE-NEXT: movq 24(%rdi), %r8 +; SSE-NEXT: movq 16(%rdi), %rdx +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movq 8(%rdi), %rsi +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: rep bsfq %rsi, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: rep bsfq %r8, %r10 +; SSE-NEXT: addl $64, %r10d +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %r10d +; SSE-NEXT: movq 32(%rdi), %rbx +; SSE-NEXT: subl $-128, %r10d +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: orq %rsi, %rax +; SSE-NEXT: cmovnel %r11d, %r10d +; SSE-NEXT: rep bsfq %rbx, %rax +; SSE-NEXT: rep bsfq %r9, %r11 +; SSE-NEXT: addl $64, %r11d +; SSE-NEXT: testq %rbx, %rbx +; SSE-NEXT: cmovnel %eax, %r11d +; SSE-NEXT: movq 48(%rdi), %r14 +; SSE-NEXT: rep bsfq %r14, %r15 +; SSE-NEXT: rep bsfq 56(%rdi), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %rbx +; SSE-NEXT: cmovnel %r11d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r8, %rsi +; SSE-NEXT: orq %rdx, %rcx +; 
SSE-NEXT: orq %rsi, %rcx +; SSE-NEXT: cmovnel %r10d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 48(%rdi), %r10 +; AVX2-NEXT: movq 40(%rdi), %r9 +; AVX2-NEXT: movq 24(%rdi), %r8 +; AVX2-NEXT: movq 16(%rdi), %rdx +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: movq 8(%rdi), %rsi +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: tzcntq %rsi, %rbx +; AVX2-NEXT: addl $64, %ebx +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %ebx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: tzcntq %r8, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %r11d +; AVX2-NEXT: subl $-128, %r11d +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: orq %rsi, %rax +; AVX2-NEXT: cmovnel %ebx, %r11d +; AVX2-NEXT: movq 32(%rdi), %rbx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rbx, %rax +; AVX2-NEXT: xorl %r14d, %r14d +; AVX2-NEXT: tzcntq %r9, %r14 +; AVX2-NEXT: addl $64, %r14d +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %eax, %r14d +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %r10, %r15 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq 56(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %r9, %rbx +; AVX2-NEXT: cmovnel %r14d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r8, %rsi +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: cmovnel %r11d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_cttz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_cttz_undef_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} 
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i512, ptr %p0 + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm3, %rax +; SSE-NEXT: pextrq $1, %xmm2, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rcx +; SSE-NEXT: movq %xmm1, %rsi +; SSE-NEXT: pextrq $1, %xmm0, %rdi +; SSE-NEXT: movq %xmm0, %r8 +; SSE-NEXT: rep bsfq %r8, %r9 +; SSE-NEXT: rep bsfq %rdi, %rdi +; SSE-NEXT: addl $64, %edi +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %r9d, %edi +; SSE-NEXT: rep bsfq %rsi, %r8 +; SSE-NEXT: rep bsfq %rcx, %rcx +; SSE-NEXT: addl $64, %ecx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %r8d, %ecx +; SSE-NEXT: movq %xmm2, %rsi +; SSE-NEXT: subl $-128, %ecx +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %edi, %ecx +; SSE-NEXT: rep bsfq %rsi, %rdi +; SSE-NEXT: rep bsfq %rdx, %rdx +; SSE-NEXT: addl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: movq %xmm3, %rsi +; SSE-NEXT: rep bsfq %rsi, %rdi +; SSE-NEXT: rep bsfq %rax, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: vmovq %xmm0, %r9 +; AVX2-NEXT: tzcntq %r9, %r10 +; AVX2-NEXT: tzcntq %rdi, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: tzcntq %r8, %r10 +; AVX2-NEXT: tzcntq %rsi, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r10d, %esi +; AVX2-NEXT: subl $-128, %esi +; AVX2-NEXT: orq %rdi, %r9 +; AVX2-NEXT: cmovnel %r11d, %esi +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rdx, %rdi +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: tzcntq %rcx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: vmovq %xmm2, %rdi +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: tzcntq %rdi, %r9 +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, 
%zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_undef_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { +; SSE-LABEL: test_cttz_undef_i1024: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %r9, %r14 +; SSE-NEXT: movq %rcx, %rbx +; SSE-NEXT: movq %rdx, %r10 +; SSE-NEXT: movq %rsi, %r9 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: rep bsfq %rdi, %rax +; SSE-NEXT: rep bsfq %rsi, %r12 +; SSE-NEXT: addl $64, %r12d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %eax, %r12d +; SSE-NEXT: rep bsfq %r10, %r15 +; SSE-NEXT: rep bsfq %rbx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %rdi, %r13 +; SSE-NEXT: orq %rsi, %r13 +; SSE-NEXT: cmovnel %r12d, %eax +; SSE-NEXT: movq %r8, %r15 +; SSE-NEXT: rep bsfq %r8, %r12 +; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: rep bsfq %r14, %r13 +; SSE-NEXT: addl $64, %r13d +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %r12d, %r13d +; SSE-NEXT: rep bsfq %rcx, %rbp +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r12 +; SSE-NEXT: addl $64, %r12d +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %ebp, %r12d +; SSE-NEXT: subl $-128, %r12d +; SSE-NEXT: movq %r8, %rbp +; SSE-NEXT: orq %r14, %rbp +; SSE-NEXT: cmovnel %r13d, %r12d +; SSE-NEXT: addl $256, %r12d # imm = 0x100 +; SSE-NEXT: movq %rsi, %r13 +; SSE-NEXT: orq %rbx, %r13 +; SSE-NEXT: movq %rdi, %rbp +; SSE-NEXT: orq %r10, %rbp +; SSE-NEXT: orq %r13, %rbp +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; SSE-NEXT: cmovnel %eax, %r12d +; SSE-NEXT: rep bsfq %r11, %rbp +; SSE-NEXT: rep bsfq %r13, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r11, %r11 +; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rbp +; SSE-NEXT: addl $64, %ebp +; SSE-NEXT: rep bsfq %rdx, %rcx +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %ecx, %ebp +; SSE-NEXT: subl $-128, %ebp +; SSE-NEXT: movq %r11, %rcx +; SSE-NEXT: orq %r13, %rcx +; 
SSE-NEXT: cmovnel %eax, %ebp +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; SSE-NEXT: rep bsfq %r14, %rcx +; SSE-NEXT: addl $64, %ecx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE-NEXT: rep bsfq %r8, %rsi +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r14, %rdx +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %r13, %r11 +; SSE-NEXT: cmovnel %ebp, %eax +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; SSE-NEXT: orq %rbx, %r9 +; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: orq %r15, %rdi +; SSE-NEXT: orq %r10, %rdi +; SSE-NEXT: addl $512, %eax # imm = 0x200 +; SSE-NEXT: orq %r9, %rdi +; SSE-NEXT: cmovnel %r12d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: test_cttz_undef_i1024: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq %r9, %rbx +; AVX2-NEXT: movq %r8, %r14 +; AVX2-NEXT: movq %rcx, %r11 +; AVX2-NEXT: movq %rdx, %r10 +; AVX2-NEXT: movq %rsi, %r9 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: tzcntq %rdi, %rax +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %r9, %r15 +; AVX2-NEXT: addl $64, %r15d +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %eax, %r15d +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %r10, %r12 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r11, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: movq %rdi, %r12 +; AVX2-NEXT: orq %r9, %r12 +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %r14, %r15 +; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %rbx, %r12 +; AVX2-NEXT: addl $64, %r12d +; AVX2-NEXT: testq %r14, %r14 +; AVX2-NEXT: cmovnel %r15d, %r12d +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: tzcntq %rcx, %r13 +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %rdx, %r15 +; AVX2-NEXT: addl $64, %r15d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %r13d, %r15d +; AVX2-NEXT: subl $-128, %r15d +; AVX2-NEXT: movq %r14, %r13 +; AVX2-NEXT: orq %rbx, %r13 +; AVX2-NEXT: cmovnel %r12d, %r15d +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX2-NEXT: addl $256, %r15d # imm = 0x100 +; AVX2-NEXT: movq %r9, %r13 +; AVX2-NEXT: orq %r11, %r13 +; AVX2-NEXT: movq %rdi, %rbp +; AVX2-NEXT: orq %r10, %rbp +; AVX2-NEXT: orq %r13, %rbp +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX2-NEXT: cmovnel %eax, %r15d +; AVX2-NEXT: xorl %ebp, %ebp +; AVX2-NEXT: tzcntq %r12, %rbp +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r13, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r12, %r12 +; AVX2-NEXT: cmovnel %ebp, %eax +; AVX2-NEXT: xorl %ebp, %ebp +; 
AVX2-NEXT: tzcntq %r8, %rbp +; AVX2-NEXT: addl $64, %ebp +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: tzcntq %rsi, %rcx +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %ecx, %ebp +; AVX2-NEXT: subl $-128, %ebp +; AVX2-NEXT: movq %r12, %rcx +; AVX2-NEXT: orq %r13, %rcx +; AVX2-NEXT: cmovnel %eax, %ebp +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: tzcntq %rbx, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX2-NEXT: tzcntq %r8, %rsi +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rbx, %rdx +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r13 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r12 +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r13, %r12 +; AVX2-NEXT: cmovnel %ebp, %eax +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; AVX2-NEXT: orq %r11, %r9 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: orq %r14, %rdi +; AVX2-NEXT: orq %r10, %rdi +; AVX2-NEXT: addl $512, %eax # imm = 0x200 +; AVX2-NEXT: orq %r9, %rdi +; AVX2-NEXT: cmovnel %r15d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_cttz_undef_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vmovq %rdi, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovq %r9, %xmm1 +; AVX512F-NEXT: vmovq %r8, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vpandnq %zmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vplzcntq %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %r10d +; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpsubq %zmm0, %zmm4, %zmm0 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: addl $512, %eax # imm = 0x200 +; AVX512F-NEXT: orq %r9, %rsi +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rcx +; AVX512F-NEXT: orq %rsi, %rcx +; AVX512F-NEXT: orq %r8, %rdi +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rdx +; AVX512F-NEXT: orq %rdi, %rdx +; AVX512F-NEXT: orq %rcx, %rdx +; AVX512F-NEXT: cmovnel %r10d, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_cttz_undef_i1024: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %rcx, %xmm0 +; 
AVX512VL-NEXT: vmovq %rdx, %xmm1 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vmovq %rsi, %xmm1 +; AVX512VL-NEXT: vmovq %rdi, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vmovq %r9, %xmm1 +; AVX512VL-NEXT: vmovq %r8, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm3 +; AVX512VL-NEXT: vpandnq %zmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vplzcntq %zmm3, %zmm3 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vpsubq %zmm3, %zmm4, %zmm3 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %r10d +; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm0 +; AVX512VL-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VL-NEXT: vpsubq %zmm0, %zmm4, %zmm0 +; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: addl $512, %eax # imm = 0x200 +; AVX512VL-NEXT: orq %r9, %rsi +; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rcx +; AVX512VL-NEXT: orq %rsi, %rcx +; AVX512VL-NEXT: orq %r8, %rdi +; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rdx +; AVX512VL-NEXT: orq %rdi, %rdx +; AVX512VL-NEXT: orq %rcx, %rdx +; AVX512VL-NEXT: cmovnel %r10d, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: test_cttz_undef_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vmovq %r9, %xmm1 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm3 +; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm0, %zmm3 +; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448] +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %r10d +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200 +; AVX512POPCNT-NEXT: orq %r9, %rsi +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rcx +; AVX512POPCNT-NEXT: orq %rsi, %rcx +; AVX512POPCNT-NEXT: orq %r8, %rdi +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rdx +; 
AVX512POPCNT-NEXT: orq %rdi, %rdx +; AVX512POPCNT-NEXT: orq %rcx, %rdx +; AVX512POPCNT-NEXT: cmovnel %r10d, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 -1) + %res = trunc i1024 %cnt to i32 + ret i32 %res +} + +define i32 @load_cttz_undef_i1024(ptr %p0) nounwind { +; SSE-LABEL: load_cttz_undef_i1024: +; SSE: # %bb.0: +; SSE-NEXT: pushq %rbp +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %r13 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq 72(%rdi), %rbx +; SSE-NEXT: movq 56(%rdi), %r9 +; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 48(%rdi), %rcx +; SSE-NEXT: movq 40(%rdi), %r10 +; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 32(%rdi), %rsi +; SSE-NEXT: movq 24(%rdi), %rbp +; SSE-NEXT: movq (%rdi), %r8 +; SSE-NEXT: movq 8(%rdi), %r11 +; SSE-NEXT: rep bsfq %r8, %rax +; SSE-NEXT: rep bsfq %r11, %rdx +; SSE-NEXT: addl $64, %edx +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %eax, %edx +; SSE-NEXT: movq 16(%rdi), %r14 +; SSE-NEXT: rep bsfq %r14, %r15 +; SSE-NEXT: rep bsfq %rbp, %rax +; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r14, %r14 +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: movq %r8, %r15 +; SSE-NEXT: orq %r11, %r15 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: rep bsfq %rsi, %rdx +; SSE-NEXT: rep bsfq %r10, %r13 +; SSE-NEXT: addl $64, %r13d +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: cmovnel %edx, %r13d +; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: rep bsfq %rcx, %rdx +; SSE-NEXT: rep bsfq %r9, %r15 +; SSE-NEXT: addl $64, %r15d +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %r15d +; SSE-NEXT: movq 64(%rdi), %r12 +; SSE-NEXT: subl $-128, %r15d +; SSE-NEXT: movq %rsi, %rdx +; SSE-NEXT: orq %r10, %rdx +; SSE-NEXT: cmovnel %r13d, %r15d +; SSE-NEXT: addl $256, %r15d # imm = 0x100 +; SSE-NEXT: movq %r11, %rdx +; SSE-NEXT: orq %rbp, %rdx +; SSE-NEXT: movq %r8, %r13 +; SSE-NEXT: orq %r14, %r13 +; SSE-NEXT: orq %rdx, %r13 +; SSE-NEXT: cmovnel %eax, %r15d +; SSE-NEXT: rep bsfq %r12, %rdx +; SSE-NEXT: rep bsfq %rbx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %r12, %r12 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: movq 88(%rdi), %rbp +; SSE-NEXT: rep bsfq %rbp, %r13 +; SSE-NEXT: addl $64, %r13d +; SSE-NEXT: movq 80(%rdi), %r10 +; SSE-NEXT: rep bsfq %r10, %rcx +; SSE-NEXT: testq %r10, %r10 +; SSE-NEXT: cmovnel %ecx, %r13d +; SSE-NEXT: subl $-128, %r13d +; SSE-NEXT: movq %r12, %rcx +; SSE-NEXT: orq %rbx, %rcx +; SSE-NEXT: cmovnel %eax, %r13d +; SSE-NEXT: movq 104(%rdi), %r9 +; SSE-NEXT: rep bsfq %r9, %rcx +; SSE-NEXT: addl $64, %ecx +; SSE-NEXT: movq 96(%rdi), %rdx +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %eax, %ecx +; SSE-NEXT: rep bsfq 120(%rdi), %rax +; SSE-NEXT: movq 112(%rdi), %rdi +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: rep bsfq %rdi, %rsi +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: orq %r9, %rdx +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: orq %rbp, %rbx +; SSE-NEXT: orq %r10, %r12 +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: orq %rbx, %r12 +; SSE-NEXT: cmovnel %r13d, %eax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte 
Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; SSE-NEXT: orq %rcx, %r11 +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; SSE-NEXT: orq %r14, %r8 +; SSE-NEXT: addl $512, %eax # imm = 0x200 +; SSE-NEXT: orq %r11, %r8 +; SSE-NEXT: cmovnel %r15d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r12 +; SSE-NEXT: popq %r13 +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: popq %rbp +; SSE-NEXT: retq +; +; AVX2-LABEL: load_cttz_undef_i1024: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq 72(%rdi), %r14 +; AVX2-NEXT: movq 64(%rdi), %r15 +; AVX2-NEXT: movq 56(%rdi), %r9 +; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 48(%rdi), %rcx +; AVX2-NEXT: movq 40(%rdi), %r10 +; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq 32(%rdi), %rsi +; AVX2-NEXT: movq 24(%rdi), %rbp +; AVX2-NEXT: movq 16(%rdi), %rbx +; AVX2-NEXT: movq (%rdi), %r8 +; AVX2-NEXT: movq 8(%rdi), %r11 +; AVX2-NEXT: tzcntq %r8, %rax +; AVX2-NEXT: tzcntq %r11, %rdx +; AVX2-NEXT: addl $64, %edx +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %eax, %edx +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %rbx, %r12 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rbp, %rax +; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rbx, %rbx +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: movq %r8, %r12 +; AVX2-NEXT: orq %r11, %r12 +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: tzcntq %rsi, %rdx +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: tzcntq %r10, %r13 +; AVX2-NEXT: addl $64, %r13d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: cmovnel %edx, %r13d +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: xorl %r12d, %r12d +; AVX2-NEXT: tzcntq %r9, %r12 +; AVX2-NEXT: addl $64, %r12d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %r12d +; AVX2-NEXT: subl $-128, %r12d +; AVX2-NEXT: movq %rsi, %rdx +; AVX2-NEXT: orq %r10, %rdx +; AVX2-NEXT: cmovnel %r13d, %r12d +; AVX2-NEXT: addl $256, %r12d # imm = 0x100 +; AVX2-NEXT: movq %r11, %rdx +; AVX2-NEXT: orq %rbp, %rdx +; AVX2-NEXT: movq %r8, %r13 +; AVX2-NEXT: orq %rbx, %r13 +; AVX2-NEXT: orq %rdx, %r13 +; AVX2-NEXT: cmovnel %eax, %r12d +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: tzcntq %r15, %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r14, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %r15, %r15 +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: movq 88(%rdi), %rbp +; AVX2-NEXT: xorl %r13d, %r13d +; AVX2-NEXT: tzcntq %rbp, %r13 +; AVX2-NEXT: addl $64, %r13d +; AVX2-NEXT: movq 80(%rdi), %r10 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: tzcntq %r10, %rcx +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovnel %ecx, %r13d +; AVX2-NEXT: subl $-128, %r13d +; AVX2-NEXT: movq %r15, %rcx +; AVX2-NEXT: orq %r14, %rcx +; AVX2-NEXT: cmovnel %eax, %r13d +; AVX2-NEXT: movq 104(%rdi), %r9 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: tzcntq %r9, %rcx +; AVX2-NEXT: addl $64, %ecx +; AVX2-NEXT: movq 96(%rdi), %rdx +; AVX2-NEXT: 
xorl %eax, %eax +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %eax, %ecx +; AVX2-NEXT: movq 112(%rdi), %rsi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq 120(%rdi), %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: tzcntq %rsi, %rdi +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %r9, %rdx +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: orq %rbp, %r14 +; AVX2-NEXT: orq %r10, %r15 +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: orq %r14, %r15 +; AVX2-NEXT: cmovnel %r13d, %eax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; AVX2-NEXT: orq %rcx, %r11 +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; AVX2-NEXT: orq %rbx, %r8 +; AVX2-NEXT: addl $512, %eax # imm = 0x200 +; AVX2-NEXT: orq %r11, %r8 +; AVX2-NEXT: cmovnel %r12d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: load_cttz_undef_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 +; AVX512F-NEXT: movq 16(%rdi), %rax +; AVX512F-NEXT: movq (%rdi), %rcx +; AVX512F-NEXT: movq 8(%rdi), %rdx +; AVX512F-NEXT: movq 24(%rdi), %rsi +; AVX512F-NEXT: orq 56(%rdi), %rsi +; AVX512F-NEXT: orq 40(%rdi), %rdx +; AVX512F-NEXT: orq 48(%rdi), %rax +; AVX512F-NEXT: orq %rsi, %rdx +; AVX512F-NEXT: orq 32(%rdi), %rcx +; AVX512F-NEXT: orq %rax, %rcx +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm3 +; AVX512F-NEXT: vpandnq %zmm3, %zmm1, %zmm3 +; AVX512F-NEXT: vplzcntq %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm1, %esi +; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vpsubq %zmm1, %zmm4, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: addl $512, %eax # imm = 0x200 +; AVX512F-NEXT: orq %rdx, %rcx +; AVX512F-NEXT: cmovnel %esi, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_cttz_undef_i1024: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqu64 64(%rdi), %zmm0 +; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm1 +; AVX512VL-NEXT: movq 16(%rdi), %rax +; AVX512VL-NEXT: movq (%rdi), %rcx +; AVX512VL-NEXT: movq 8(%rdi), %rdx +; AVX512VL-NEXT: movq 24(%rdi), %rsi +; AVX512VL-NEXT: orq 56(%rdi), %rsi +; AVX512VL-NEXT: orq 40(%rdi), %rdx +; AVX512VL-NEXT: orq 48(%rdi), %rax +; AVX512VL-NEXT: orq 32(%rdi), %rcx +; AVX512VL-NEXT: orq %rsi, %rdx +; AVX512VL-NEXT: orq %rax, %rcx +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm3 +; AVX512VL-NEXT: vpandnq %zmm3, %zmm1, %zmm3 +; AVX512VL-NEXT: vplzcntq %zmm3, %zmm3 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vpsubq %zmm3, %zmm4, %zmm3 +; AVX512VL-NEXT: vptestmq %zmm1, 
%zmm1, %k1 +; AVX512VL-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm1, %esi +; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vpsubq %zmm1, %zmm4, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: addl $512, %eax # imm = 0x200 +; AVX512VL-NEXT: orq %rdx, %rcx +; AVX512VL-NEXT: cmovnel %esi, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_undef_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0 +; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm1 +; AVX512POPCNT-NEXT: movq 16(%rdi), %rax +; AVX512POPCNT-NEXT: movq (%rdi), %rcx +; AVX512POPCNT-NEXT: movq 8(%rdi), %rdx +; AVX512POPCNT-NEXT: movq 24(%rdi), %rsi +; AVX512POPCNT-NEXT: orq 56(%rdi), %rsi +; AVX512POPCNT-NEXT: orq 40(%rdi), %rdx +; AVX512POPCNT-NEXT: orq 48(%rdi), %rax +; AVX512POPCNT-NEXT: orq 32(%rdi), %rcx +; AVX512POPCNT-NEXT: orq %rsi, %rdx +; AVX512POPCNT-NEXT: orq %rax, %rcx +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm3 +; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm1, %zmm3 +; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448] +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm1, %esi +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200 +; AVX512POPCNT-NEXT: orq %rdx, %rcx +; AVX512POPCNT-NEXT: cmovnel %esi, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i1024, ptr %p0 - %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0) + %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 -1) %res = trunc i1024 %cnt to i32 ret i32 %res } diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 19d751d1..023fb506 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE -; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE,SSE4 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512 @@ -203,24 +203,14 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), 
%eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB5_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: .LBB5_2: -; X86-NEXT: andl 4(%eax), %esi -; X86-NEXT: andl (%eax), %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: setne %al -; X86-NEXT: popl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $32, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al ; X86-NEXT: retl ; ; X64-LABEL: test_ne_i64: @@ -242,38 +232,20 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB6_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB6_2: -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: andl %esi, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: andl %eax, %ebp -; X86-NEXT: xorl %esi, %edi -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: setne %al -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: complement_ne_i64: @@ -300,40 +272,20 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB7_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: .LBB7_2: -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: movl 4(%edx), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: notl %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: andl %esi, %ebp -; X86-NEXT: notl %esi -; X86-NEXT: andl %ecx, %edi -; X86-NEXT: andl %eax, %esi -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: sete %al -; X86-NEXT: movl %esi, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; 
X64-LABEL: reset_eq_i64: @@ -361,38 +313,20 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB8_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB8_2: -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: andl %esi, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: andl %eax, %ebp -; X86-NEXT: orl %esi, %edi -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: setne %al -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: set_ne_i64: @@ -419,52 +353,26 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB9_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl $0, %eax -; X86-NEXT: .LBB9_2: -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: notl %ebp -; X86-NEXT: je .LBB9_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: .LBB9_4: +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: andl %ecx, %ebx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl (%edi), %ecx -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: andl %ecx, %ebp -; X86-NEXT: orl %esi, %ebp -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %ebp, (%edi) -; X86-NEXT: movl %ebx, 4(%edi) -; X86-NEXT: sete %al +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i64: @@ -516,101 +424,25 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i128: ; X86: # 
%bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $48, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, (%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %esi -; X86-NEXT: movl 24(%esp,%esi), %edi -; X86-NEXT: movl 28(%esp,%esi), %eax -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 16(%esp,%esi), %edx -; X86-NEXT: movl 20(%esp,%esi), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: andl 8(%ebx), %edi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: andl 12(%ebx), %eax -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $96, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al ; X86-NEXT: retl ; -; SSE-LABEL: test_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %rsi, %rax -; SSE-NEXT: andq 8(%rdi), %rdx -; SSE-NEXT: andq (%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: setne %al -; SSE-NEXT: retq -; -; AVX2-LABEL: test_ne_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: movl $1, %edx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %rdx, %rsi -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rdx, %rsi -; AVX2-NEXT: cmovneq %rax, %rdx -; AVX2-NEXT: andq 8(%rdi), %rsi -; AVX2-NEXT: andq (%rdi), %rdx -; AVX2-NEXT: orq %rsi, %rdx -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_ne_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shlxq %rcx, %rax, %rax -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rax, %rdx -; AVX512-NEXT: cmovneq %rsi, %rax -; AVX512-NEXT: andq 8(%rdi), %rdx -; AVX512-NEXT: andq (%rdi), %rax -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: setne %al -; AVX512-NEXT: retq +; X64-LABEL: test_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $96, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -623,124 +455,33 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i128: ; X86: # 
%bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 8(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 12(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: complement_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: xorq %rcx, %rsi -; SSE-NEXT: xorq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: setne %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) 
-; SSE-NEXT: retq -; -; AVX-LABEL: complement_ne_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: andq %rsi, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: andq %rdx, %r9 -; AVX-NEXT: xorq %rcx, %rsi -; AVX-NEXT: xorq %rax, %rdx -; AVX-NEXT: orq %r8, %r9 -; AVX-NEXT: setne %al -; AVX-NEXT: movq %rdx, (%rdi) -; AVX-NEXT: movq %rsi, 8(%rdi) -; AVX-NEXT: retq +; X64-LABEL: complement_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btcl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -755,124 +496,33 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %edx -; X86-NEXT: movl 60(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %esi -; X86-NEXT: movl 52(%esp,%eax), %edi -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl 8(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl (%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl 4(%ebx), %ebx -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl %ebx, %ecx -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: movl %edx, 8(%edi) -; X86-NEXT: movl %eax, 12(%edi) -; 
X86-NEXT: movl %esi, (%edi) -; X86-NEXT: movl %ecx, 4(%edi) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_eq_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: notq %rsi -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: notq %rdx -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: sete %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) -; SSE-NEXT: retq -; -; AVX-LABEL: reset_eq_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: andnq %rcx, %rsi, %r8 -; AVX-NEXT: andq %rsi, %rcx -; AVX-NEXT: andnq %rax, %rdx, %rsi -; AVX-NEXT: andq %rdx, %rax -; AVX-NEXT: orq %rcx, %rax -; AVX-NEXT: sete %al -; AVX-NEXT: movq %rsi, (%rdi) -; AVX-NEXT: movq %r8, 8(%rdi) -; AVX-NEXT: retq +; X64-LABEL: reset_eq_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setae %al +; X64-NEXT: btrl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -888,124 +538,33 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: 
movl 8(%ebp), %eax -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 8(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 12(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: set_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: orq %rcx, %rsi -; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: setne %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) -; SSE-NEXT: retq -; -; AVX-LABEL: set_ne_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: andq %rsi, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: andq %rdx, %r9 -; AVX-NEXT: orq %rcx, %rsi -; AVX-NEXT: orq %rax, %rdx -; AVX-NEXT: orq %r8, %r9 -; AVX-NEXT: setne %al -; AVX-NEXT: movq %rdx, (%rdi) -; AVX-NEXT: movq %rsi, 8(%rdi) -; AVX-NEXT: retq +; X64-LABEL: set_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btsl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw 
i128 1, %ofs @@ -1020,218 +579,55 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $128, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %eax -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrb $3, %dl -; X86-NEXT: andb $12, %dl -; X86-NEXT: negb %dl -; X86-NEXT: movsbl %dl, %esi -; X86-NEXT: movl 64(%esp,%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%esp,%esi), %ebx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 76(%esp,%esi), %edi -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %edi, %esi -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl 12(%ecx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 100(%esp,%ecx), %edi -; X86-NEXT: movl 104(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: notl %esi -; X86-NEXT: andl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 108(%esp,%ebx), %ebx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 96(%esp,%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: orl %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %eax, (%ecx) -; X86-NEXT: movl %edx, 4(%ecx) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %r9d, %r9d -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %r9, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %r9, %rax -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq 8(%rdi), %r9 -; SSE-NEXT: movq %r9, %r10 -; SSE-NEXT: andq %r8, %r10 -; SSE-NEXT: notq %r8 -; SSE-NEXT: movq %rcx, %r11 -; SSE-NEXT: andq %rsi, %r11 -; SSE-NEXT: notq %rsi -; SSE-NEXT: andq %r9, %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: orq %r10, %r11 -; SSE-NEXT: sete %al -; SSE-NEXT: movq %rsi, (%rdi) -; SSE-NEXT: movq %r8, 8(%rdi) +; SSE-NEXT: andl $96, %esi +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d +; SSE-NEXT: setae %al +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %esi -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: xorl %r8d, %r8d -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shldq %cl, %rdx, %r9 -; AVX2-NEXT: shlxq %rcx, %rsi, %rsi -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rsi, %rax -; AVX2-NEXT: cmovneq %r8, %rsi -; AVX2-NEXT: shlxq %rcx, %rdx, %rcx -; AVX2-NEXT: cmovneq %rcx, %r9 -; AVX2-NEXT: cmovneq %r8, %rcx -; AVX2-NEXT: 
movq (%rdi), %rdx -; AVX2-NEXT: movq 8(%rdi), %r8 -; AVX2-NEXT: andnq %r8, %rax, %r10 -; AVX2-NEXT: andq %rax, %r8 -; AVX2-NEXT: andnq %rdx, %rsi, %r11 -; AVX2-NEXT: andq %rsi, %rdx -; AVX2-NEXT: orq %r9, %r10 -; AVX2-NEXT: orq %rcx, %r11 -; AVX2-NEXT: orq %r8, %rdx -; AVX2-NEXT: sete %al -; AVX2-NEXT: movq %r11, (%rdi) -; AVX2-NEXT: movq %r10, 8(%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movl $1, %esi -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shldq %cl, %rsi, %r8 -; AVX512-NEXT: shlxq %rcx, %rsi, %rsi -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: xorl %r9d, %r9d -; AVX512-NEXT: shldq %cl, %rdx, %r9 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rsi, %r8 -; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %rdx, %rcx -; AVX512-NEXT: cmovneq %rcx, %r9 -; AVX512-NEXT: cmovneq %rax, %rcx -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rdx -; AVX512-NEXT: andnq %rdx, %r8, %r10 -; AVX512-NEXT: andq %r8, %rdx -; AVX512-NEXT: andnq %rax, %rsi, %r8 -; AVX512-NEXT: andq %rsi, %rax -; AVX512-NEXT: orq %r9, %r10 -; AVX512-NEXT: orq %rcx, %r8 -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: sete %al -; AVX512-NEXT: movq %r8, (%rdi) -; AVX512-NEXT: movq %r10, 8(%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: andl $96, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -1252,935 +648,317 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $60, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: test_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: andl $60, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + ret i1 %cmp +} + +define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_ne_i512: +; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $224, %esp -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %eax -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl 40(%ebx), %eax -; X86-NEXT: andl 8(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 56(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte 
Reload -; X86-NEXT: andl 24(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: andl 44(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 12(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 60(%edi), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 28(%edi), %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%edx), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: negl %edx -; X86-NEXT: movl 192(%esp,%edx), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl 32(%ebx), %ecx -; X86-NEXT: andl (%ebx), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: andl 16(%ebx), %edi -; X86-NEXT: andl 48(%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 36(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 20(%ebx), %ecx -; X86-NEXT: andl 52(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: test_ne_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx 
-; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq -48(%rsp,%rbx), %rdx -; SSE-NEXT: movq -40(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq -16(%rsp,%rbx), %r11 -; SSE-NEXT: movq -8(%rsp,%rbx), %r10 -; SSE-NEXT: shldq %cl, %r11, %r10 -; SSE-NEXT: movq -32(%rsp,%rbx), %r9 -; SSE-NEXT: movq -24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r8 -; SSE-NEXT: shldq %cl, %r9, %r8 -; SSE-NEXT: movq -56(%rsp,%rbx), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rdx -; SSE-NEXT: shldq %cl, %r15, %r11 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -64(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %rsi -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: andq 32(%rdi), %r9 -; SSE-NEXT: andq 48(%rdi), %r11 -; SSE-NEXT: andq 16(%rdi), %rdx -; SSE-NEXT: orq %r11, %rdx -; SSE-NEXT: andq 40(%rdi), %r8 -; SSE-NEXT: andq 56(%rdi), %r10 -; SSE-NEXT: andq 24(%rdi), %rax -; SSE-NEXT: orq %r10, %rax -; SSE-NEXT: andq (%rdi), %rbx -; SSE-NEXT: orq %r9, %rbx -; SSE-NEXT: orq %rdx, %rbx -; SSE-NEXT: andq 8(%rdi), %rsi -; SSE-NEXT: orq %r8, %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: orq %rbx, %rsi -; SSE-NEXT: setne %al -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: retq +; X64-LABEL: complement_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btcl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = xor i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: reset_eq_i512: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl ; -; AVX2-LABEL: test_ne_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rsi -; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx -; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx -; AVX2-NEXT: movq %rbx, %rax -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq -16(%rsp,%rsi), %r11 -; AVX2-NEXT: movq -8(%rsp,%rsi), %r10 -; AVX2-NEXT: shldq %cl, %r11, %r10 -; AVX2-NEXT: movq -32(%rsp,%rsi), %r9 -; AVX2-NEXT: movq -24(%rsp,%rsi), %r14 -; AVX2-NEXT: movq %r14, %r8 -; AVX2-NEXT: shldq %cl, %r9, %r8 -; AVX2-NEXT: movq -64(%rsp,%rsi), %r15 -; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi -; AVX2-NEXT: shldq %cl, %rsi, 
%rdx -; AVX2-NEXT: shldq %cl, %r14, %r11 -; AVX2-NEXT: shldq %cl, %rbx, %r9 -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: shlxq %rcx, %r15, %rcx -; AVX2-NEXT: andq 32(%rdi), %r9 -; AVX2-NEXT: andq 48(%rdi), %r11 -; AVX2-NEXT: andq 16(%rdi), %rdx -; AVX2-NEXT: andq 40(%rdi), %r8 -; AVX2-NEXT: andq 56(%rdi), %r10 -; AVX2-NEXT: andq 24(%rdi), %rax -; AVX2-NEXT: orq %r11, %rdx -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: andq (%rdi), %rcx -; AVX2-NEXT: orq %r9, %rcx -; AVX2-NEXT: orq %rdx, %rcx -; AVX2-NEXT: andq 8(%rdi), %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: orq %rax, %rsi -; AVX2-NEXT: orq %rcx, %rsi -; AVX2-NEXT: setne %al -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; X64-LABEL: reset_eq_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setae %al +; X64-NEXT: btrl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %mask = xor i512 %bit, -1 + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = and i512 %ld, %mask + %cmp = icmp eq i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: set_ne_i512: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl ; -; AVX512-LABEL: test_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx -; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq -16(%rsp,%rbx), %r11 -; AVX512-NEXT: movq -8(%rsp,%rbx), %r10 -; AVX512-NEXT: shldq %cl, %r11, %r10 -; AVX512-NEXT: movq -32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT: movq %r15, %r8 -; AVX512-NEXT: shldq %cl, %r9, %r8 -; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi -; AVX512-NEXT: shldq %cl, %rsi, %rdx -; AVX512-NEXT: shldq %cl, %r15, %r11 -; AVX512-NEXT: shldq %cl, %r14, %r9 -; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT: shldq %cl, %rbx, %rsi -; AVX512-NEXT: shlxq %rcx, %rbx, %rcx -; AVX512-NEXT: andq 32(%rdi), %r9 -; AVX512-NEXT: andq 48(%rdi), %r11 -; AVX512-NEXT: andq 16(%rdi), %rdx -; AVX512-NEXT: andq 40(%rdi), %r8 -; AVX512-NEXT: andq 56(%rdi), %r10 -; AVX512-NEXT: andq 24(%rdi), %rax -; AVX512-NEXT: orq %r11, %rdx -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: andq (%rdi), %rcx -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: orq %rdx, 
%rcx -; AVX512-NEXT: andq 8(%rdi), %rsi -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: setne %al -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: set_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btsl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs %ld = load i512, ptr %word %test = and i512 %ld, %bit + %res = or i512 %ld, %bit %cmp = icmp ne i512 %test, 0 + store i512 %res, ptr %word ret i1 %cmp } -define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: complement_ne_i512: +define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { +; X86-LABEL: init_eq_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $272, %esp # imm = 0x110 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl 
%cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl 56(%edx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl 24(%edx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 12(%eax), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl 60(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 28(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 240(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 32(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 16(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl %esi, %edi -; X86-NEXT: movl 52(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl %ebx, 60(%edx) -; X86-NEXT: movl %edi, 56(%edx) -; X86-NEXT: movl %ecx, 52(%edx) -; X86-NEXT: movl %esi, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 4(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: complement_ne_i512: +; SSE-LABEL: init_eq_i512: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, 
-{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx ; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq (%rsp,%rbx), %rsi -; SSE-NEXT: movq 8(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 32(%rsp,%rbx), %r8 -; SSE-NEXT: movq 40(%rsp,%rbx), %rbp -; SSE-NEXT: shldq %cl, %r8, %rbp -; SSE-NEXT: movq 16(%rsp,%rbx), %r9 -; SSE-NEXT: movq 24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r10 -; SSE-NEXT: shldq %cl, %r9, %r10 -; SSE-NEXT: movq -8(%rsp,%rbx), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -16(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: movq 24(%rdi), %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 16(%rdi), %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %r8, %r13 -; SSE-NEXT: andq %rsi, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %rcx, %r13 -; SSE-NEXT: andq %rbp, %r13 -; SSE-NEXT: andq %rax, %r15 -; SSE-NEXT: orq %r13, %r15 -; SSE-NEXT: movq 32(%rdi), %r14 -; SSE-NEXT: movq %r14, %rcx -; SSE-NEXT: andq %r9, %rcx -; SSE-NEXT: movq (%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rbx, %r13 -; SSE-NEXT: orq %rcx, %r13 -; SSE-NEXT: orq %r12, %r13 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r12 -; SSE-NEXT: andq %r10, %r12 -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: andq %r11, %rax -; SSE-NEXT: orq %r12, %rax -; SSE-NEXT: orq %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: xorq %rcx, %r10 -; SSE-NEXT: xorq %r14, %r9 -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT: xorq %rdx, %r11 -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: movq %r8, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r9, 32(%rdi) -; SSE-NEXT: movq %r10, 40(%rdi) -; SSE-NEXT: movq %rsi, 16(%rdi) -; SSE-NEXT: movq %r15, 24(%rdi) -; SSE-NEXT: movq %rbx, (%rdi) -; SSE-NEXT: movq %r11, 8(%rdi) -; SSE-NEXT: setne %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp +; SSE-NEXT: andl $60, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d +; SSE-NEXT: setae %al +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: complement_ne_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp 
-; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $72, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, (%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rbx -; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX2-NEXT: movq %rbp, %rax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT: shldq %cl, %r8, %r13 -; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT: movq %r14, %r10 -; AVX2-NEXT: shldq %cl, %r9, %r10 -; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT: shldq %cl, %r11, %rsi -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r8, %r14 -; AVX2-NEXT: andq %rsi, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq 56(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r13, %r15 -; AVX2-NEXT: movq 24(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %rax, %r14 -; AVX2-NEXT: orq %r15, %r14 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq (%rsp,%rbx), %rdx -; AVX2-NEXT: movq 32(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r9, %r15 -; AVX2-NEXT: shlxq %rcx, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq (%rdi), %rbx -; AVX2-NEXT: movq %rbx, %rbp -; AVX2-NEXT: andq %rax, %rbp -; AVX2-NEXT: orq %r15, %rbp -; AVX2-NEXT: orq %r12, %rbp -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: andq %r10, %rcx -; AVX2-NEXT: movq 8(%rdi), %r15 -; AVX2-NEXT: movq %r15, %r12 -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: orq %rcx, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT: xorq %rax, %r10 -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT: xorq %r15, %r11 -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: movq %r8, 48(%rdi) -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r9, 32(%rdi) -; AVX2-NEXT: movq %r10, 40(%rdi) -; AVX2-NEXT: movq %rsi, 16(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %rbx, (%rdi) -; AVX2-NEXT: movq %r11, 8(%rdi) -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $72, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; 
AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: complement_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $72, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, (%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT: movq %rbp, %rax -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT: shldq %cl, %r8, %r13 -; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %r10 -; AVX512-NEXT: shldq %cl, %r9, %r10 -; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT: shldq %cl, %r11, %rsi -; AVX512-NEXT: shldq %cl, %r14, %r8 -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r8, %r14 -; AVX512-NEXT: andq %rsi, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq 56(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r13, %r15 -; AVX512-NEXT: movq 24(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %rax, %r14 -; AVX512-NEXT: orq %r15, %r14 -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: movq (%rsp,%rbx), %rdx -; AVX512-NEXT: movq 32(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r9, %r15 -; AVX512-NEXT: shlxq %rcx, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq (%rdi), %rbx -; AVX512-NEXT: movq %rbx, %rbp -; AVX512-NEXT: andq %rax, %rbp -; AVX512-NEXT: orq %r15, %rbp -; AVX512-NEXT: orq %r12, %rbp -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rdx, %r11 -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andq %r10, %rcx -; AVX512-NEXT: movq 8(%rdi), %r15 -; AVX512-NEXT: movq %r15, %r12 -; AVX512-NEXT: andq %r11, %r12 -; AVX512-NEXT: orq %rcx, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: xorq %rax, %r10 -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT: xorq %r15, %r11 -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: movq %r8, 48(%rdi) -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r9, 32(%rdi) -; AVX512-NEXT: movq 
%r10, 40(%rdi) -; AVX512-NEXT: movq %rsi, 16(%rdi) -; AVX512-NEXT: movq %rcx, 24(%rdi) -; AVX512-NEXT: movq %rbx, (%rdi) -; AVX512-NEXT: movq %r11, 8(%rdi) -; AVX512-NEXT: setne %al -; AVX512-NEXT: addq $72, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i512: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $60, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs + %mask = xor i512 %bit, -1 + %val0 = zext i1 %value to i512 + %val = shl nuw i512 %val0, %ofs %ld = load i512, ptr %word %test = and i512 %ld, %bit - %res = xor i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 + %res0 = and i512 %ld, %mask + %res = or i512 %res0, %val + %cmp = icmp eq i512 %test, 0 store i512 %res, ptr %word ret i1 %cmp } -define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: reset_eq_i512: +; i4096 + +define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { +; X86-LABEL: test_ne_i4096: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $4064, %edx # imm = 0xFE0 +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: test_ne_i4096: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $4064, %eax # imm = 0xFE0 +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq + %rem = and i32 %position, 4095 + %ofs = zext nneg i32 %rem to i4096 + %bit = shl nuw i4096 1, %ofs + %ld = load i4096, ptr %word + %test = and i4096 %ld, %bit + %cmp = icmp ne i4096 %test, 0 + ret i1 %cmp +} + +; Special Cases + +; Multiple uses of the stored value +define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_cmpz_i128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: andl $96, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: xorl %edx, (%eax,%ecx) +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: orl 12(%eax), %edx +; X86-NEXT: orl 8(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; SSE-LABEL: complement_cmpz_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %eax +; SSE-NEXT: shll %cl, %eax +; SSE-NEXT: andl $96, %ecx +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: xorl %eax, (%rdi,%rcx) +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: orq 8(%rdi), %rax +; SSE-NEXT: setne %al +; SSE-NEXT: retq +; +; AVX-LABEL: complement_cmpz_i128: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $esi killed $esi def $rsi +; AVX-NEXT: movl $1, %eax +; AVX-NEXT: shlxl %esi, %eax, %eax +; AVX-NEXT: andl $96, %esi +; AVX-NEXT: shrl $3, %esi +; AVX-NEXT: xorl %eax, (%rdi,%rsi) +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: orq 8(%rdi), %rax +; AVX-NEXT: setne %al +; AVX-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl 
nuw i128 1, %ofs + %ld = load i128, ptr %word + %res = xor i128 %ld, %bit + store i128 %res, ptr %word + %cmp = icmp ne i128 %res, 0 + ret i1 %cmp +} + +; Load hidden behind bitcast +define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_ne_i128_bitcast: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp @@ -2188,614 +966,298 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $288, %esp # imm = 0x120 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-NEXT: subl %eax, %edi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 4(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edi), %eax -; X86-NEXT: andl $31, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl 12(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edi), %esi -; X86-NEXT: movl %esi, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, 
%edx -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 52(%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 56(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %ecx +; X86-NEXT: subl $80, %esp +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movzwl (%eax), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl 44(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %ecx +; X86-NEXT: movzwl 12(%eax), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 256(%esp,%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: 
shldl %cl, %edi, %eax -; X86-NEXT: movl %esi, %edi -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: movl 32(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %edx +; X86-NEXT: movzwl 14(%eax), %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esi), %ecx +; X86-NEXT: shll $16, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: movzwl 2(%eax), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movzwl 4(%eax), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movzwl 6(%eax), %esi +; X86-NEXT: movzwl 8(%eax), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: movl 52(%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 
4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzwl 10(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shll $16, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: andb $96, %bl +; X86-NEXT: shrb $3, %bl +; X86-NEXT: movzbl %bl, %edi +; X86-NEXT: movl 32(%esp,%edi), %edi +; X86-NEXT: btcl %eax, %edi +; X86-NEXT: andl $96, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %edi, (%ecx,%eax) +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movw %dx, 14(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 60(%eax) -; X86-NEXT: movl %esi, 56(%eax) -; X86-NEXT: movl %ecx, 52(%eax) +; X86-NEXT: movw %dx, 12(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 44(%eax) +; X86-NEXT: movw %cx, 10(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 40(%eax) +; X86-NEXT: movw %cx, 8(%eax) +; X86-NEXT: movw %si, 6(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 36(%eax) +; X86-NEXT: movw %cx, 4(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 32(%eax) +; X86-NEXT: movw %cx, 2(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 24(%eax) -; X86-NEXT: movl %ebx, 20(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 16(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 48(%eax) -; X86-NEXT: sete %al +; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx ; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; SSE2-LABEL: complement_ne_i128_bitcast: +; SSE2: # %bb.0: +; SSE2-NEXT: # kill: def $esi killed $esi def $rsi +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movq 8(%rdi), %rax +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movl %esi, %ecx +; SSE2-NEXT: andb $32, %cl +; SSE2-NEXT: shrdq %cl, %rax, %rdx +; SSE2-NEXT: shrq %cl, %rax +; SSE2-NEXT: testb $64, %sil +; SSE2-NEXT: cmoveq %rdx, %rax +; SSE2-NEXT: btcl %esi, %eax +; SSE2-NEXT: andl $96, %esi +; SSE2-NEXT: shrl $3, %esi +; SSE2-NEXT: movl %eax, (%rdi,%rsi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: complement_ne_i128_bitcast: +; SSE4: # %bb.0: +; SSE4-NEXT: # kill: def $esi killed $esi def $rsi +; SSE4-NEXT: movdqa (%rdi), %xmm0 +; SSE4-NEXT: pextrq $1, %xmm0, %rax +; SSE4-NEXT: movq %xmm0, %rdx +; SSE4-NEXT: movl %esi, %ecx +; SSE4-NEXT: andb $32, %cl +; SSE4-NEXT: shrdq %cl, %rax, %rdx +; SSE4-NEXT: shrq %cl, %rax +; SSE4-NEXT: testb $64, %sil +; SSE4-NEXT: cmoveq %rdx, %rax +; SSE4-NEXT: btcl %esi, %eax +; SSE4-NEXT: andl $96, %esi +; SSE4-NEXT: shrl $3, %esi +; SSE4-NEXT: movl %eax, (%rdi,%rsi) +; SSE4-NEXT: retq +; +; AVX-LABEL: complement_ne_i128_bitcast: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $esi killed $esi def $rsi +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vmovq %xmm0, %rdx +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: andb $32, %cl +; AVX-NEXT: shrdq %cl, %rax, %rdx +; AVX-NEXT: shrxq %rcx, %rax, %rax +; AVX-NEXT: testb $64, %sil +; AVX-NEXT: cmoveq %rdx, %rax +; AVX-NEXT: btcl %esi, %eax +; AVX-NEXT: andl $96, %esi +; AVX-NEXT: shrl $3, %esi +; AVX-NEXT: movl 
%eax, (%rdi,%rsi) +; AVX-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %ldv = load <8 x i16>, ptr %word + %ld = bitcast <8 x i16> %ldv to i128 + %test = and i128 %ld, %bit + %res = xor i128 %ld, %bit + store i128 %res, ptr %word + ret <8 x i16> %ldv +} + +; Multiple loads in store chain +define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { +; X86-LABEL: reset_multiload_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: btrl %edx, %ebx +; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl %ebx, (%ecx,%esi) +; X86-NEXT: jae .LBB23_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB23_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: reset_multiload_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %r9d +; X64-NEXT: movl %r9d, %r8d +; X64-NEXT: btrl %esi, %r8d +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: btl %esi, %r9d +; X64-NEXT: jb .LBB23_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: .LBB23_2: +; X64-NEXT: movl %r8d, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %mask = xor i128 %bit, -1 + %ld = load i128, ptr %word + %sel = load i32, ptr %p + %test = and i128 %ld, %bit + %res = and i128 %ld, %mask + %cmp = icmp eq i128 %test, 0 + store i128 %res, ptr %word + %ret = select i1 %cmp, i32 %sel, i32 0 + ret i32 %ret +} + +; Multiple uses of the store chain AND stored value +define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind { +; X86-LABEL: chain_reset_i256: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-2, %edi +; X86-NEXT: roll %cl, %edi +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: andl $28, %ecx +; X86-NEXT: andl %edi, (%esi,%ecx) +; X86-NEXT: movl 8(%esi), %ebx +; X86-NEXT: movl (%esi), %edi +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 12(%esi), %ebp +; X86-NEXT: orl 28(%esi), %ebp +; X86-NEXT: orl 20(%esi), %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl 24(%esi), %ebx +; X86-NEXT: movl 16(%esi), %ebp +; X86-NEXT: orl %edi, %ebp +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl %edi, (%edx) +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: orl %ecx, %ebp +; X86-NEXT: jne .LBB24_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl %esi, %eax +; X86-NEXT: .LBB24_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_eq_i512: +; SSE-LABEL: chain_reset_i256: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: 
movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rdx -; SSE-NEXT: movq (%rsp,%rdx), %r9 -; SSE-NEXT: movq 8(%rsp,%rdx), %r8 -; SSE-NEXT: movq %r8, %rsi -; SSE-NEXT: shldq %cl, %r9, %rsi -; SSE-NEXT: movq -8(%rsp,%rdx), %rax -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: movq 16(%rsp,%rdx), %r14 -; SSE-NEXT: movq 24(%rsp,%rdx), %r10 -; SSE-NEXT: movq %r10, %rbx -; SSE-NEXT: shldq %cl, %r14, %rbx -; SSE-NEXT: shldq %cl, %r8, %r14 -; SSE-NEXT: movq 32(%rsp,%rdx), %r13 -; SSE-NEXT: movq 40(%rsp,%rdx), %r12 -; SSE-NEXT: shldq %cl, %r13, %r12 -; SSE-NEXT: shldq %cl, %r10, %r13 -; SSE-NEXT: movq -16(%rsp,%rdx), %rdx -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq %r12, %rbp -; SSE-NEXT: movq %r9, %r15 -; SSE-NEXT: movq %rsi, %r11 -; SSE-NEXT: movq 16(%rdi), %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r13 -; SSE-NEXT: andq %r8, %r9 -; SSE-NEXT: orq %r13, %r9 -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r12 -; SSE-NEXT: movq 24(%rdi), %r10 -; SSE-NEXT: andq %r10, %rsi -; SSE-NEXT: orq %r12, %rsi -; SSE-NEXT: movq %r14, %r13 -; SSE-NEXT: movq 32(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r14 -; SSE-NEXT: movq %rdx, %r12 +; SSE-NEXT: # kill: def $ecx killed $ecx def $rcx +; SSE-NEXT: movl $-2, %eax +; SSE-NEXT: roll %cl, %eax +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: andl $28, %ecx +; SSE-NEXT: andl %eax, (%rdi,%rcx) ; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %rdx -; SSE-NEXT: orq %r14, %rdx -; SSE-NEXT: orq %r9, %rdx -; SSE-NEXT: movq %rbx, %r14 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: andq %rcx, %rbx -; SSE-NEXT: movq %rax, %r9 ; SSE-NEXT: movq 8(%rdi), %r8 -; SSE-NEXT: andq %r8, %rax -; SSE-NEXT: orq %rbx, %rax -; SSE-NEXT: orq %rsi, %rax -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq %r10, %r11 -; SSE-NEXT: notq %r15 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq %rcx, %r14 -; SSE-NEXT: notq %r13 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT: notq %rbp -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE-NEXT: notq %rcx -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; SSE-NEXT: notq %r9 -; SSE-NEXT: andq %r8, %r9 -; SSE-NEXT: notq %r12 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rcx, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r13, 32(%rdi) -; SSE-NEXT: movq %r14, 40(%rdi) -; SSE-NEXT: movq %r15, 16(%rdi) -; SSE-NEXT: movq %r11, 24(%rdi) -; SSE-NEXT: movq 
%r12, (%rdi) -; SSE-NEXT: movq %r9, 8(%rdi) -; SSE-NEXT: sete %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp +; SSE-NEXT: orq 24(%rdi), %r8 +; SSE-NEXT: movq 16(%rdi), %rdi +; SSE-NEXT: orq %rcx, %rdi +; SSE-NEXT: movl (%rsi), %eax +; SSE-NEXT: movl %ecx, (%rsi) +; SSE-NEXT: movl (%rdx), %ecx +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: orq %r8, %rdi +; SSE-NEXT: cmovnel %ecx, %eax ; SSE-NEXT: retq ; -; AVX2-LABEL: reset_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: pushq %rax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rdx -; AVX2-NEXT: movq -48(%rsp,%rdx), %r8 -; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx -; AVX2-NEXT: movq %rbx, %rax -; AVX2-NEXT: shldq %cl, %r8, %rax -; AVX2-NEXT: movq -16(%rsp,%rdx), %r10 -; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi -; AVX2-NEXT: shldq %cl, %r10, %rsi -; AVX2-NEXT: movq -32(%rsp,%rdx), %r11 -; AVX2-NEXT: movq -24(%rsp,%rdx), %r14 -; AVX2-NEXT: movq %r14, %r9 -; AVX2-NEXT: shldq %cl, %r11, %r9 -; AVX2-NEXT: movq -64(%rsp,%rdx), %r15 -; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx -; AVX2-NEXT: shldq %cl, %rdx, %r8 -; AVX2-NEXT: shldq %cl, %r14, %r10 -; AVX2-NEXT: shldq %cl, %rbx, %r11 -; AVX2-NEXT: shldq %cl, %r15, %rdx -; AVX2-NEXT: shlxq %rcx, %r15, %rcx -; AVX2-NEXT: movq 24(%rdi), %rbx -; AVX2-NEXT: movq 56(%rdi), %r14 -; AVX2-NEXT: movq 16(%rdi), %r15 -; AVX2-NEXT: movq 48(%rdi), %r13 -; AVX2-NEXT: movq 32(%rdi), %rbp -; AVX2-NEXT: andnq %rbp, %r11, %r12 -; AVX2-NEXT: andq %r11, %rbp -; AVX2-NEXT: andnq %r13, %r10, %r11 -; AVX2-NEXT: andq %r10, %r13 -; AVX2-NEXT: andnq %r15, %r8, %r10 -; AVX2-NEXT: andq %r8, %r15 -; AVX2-NEXT: movq 40(%rdi), %r8 -; AVX2-NEXT: orq %r13, %r15 -; AVX2-NEXT: andnq %r8, %r9, %r13 -; AVX2-NEXT: andq %r9, %r8 -; AVX2-NEXT: andnq %r14, %rsi, %r9 -; AVX2-NEXT: andq %rsi, %r14 -; AVX2-NEXT: andnq %rbx, %rax, %rsi -; AVX2-NEXT: andq %rax, %rbx -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: orq %r14, %rbx -; AVX2-NEXT: andnq %rax, %rcx, %r14 -; AVX2-NEXT: andq %rcx, %rax -; AVX2-NEXT: orq %rbp, %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: orq %r15, %rax -; AVX2-NEXT: andnq %rcx, %rdx, %r15 -; AVX2-NEXT: andq %rdx, %rcx -; AVX2-NEXT: orq %r8, %rcx -; AVX2-NEXT: orq %rbx, %rcx -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: movq %r11, 48(%rdi) -; AVX2-NEXT: movq %r9, 56(%rdi) -; AVX2-NEXT: movq %r12, 32(%rdi) -; AVX2-NEXT: movq %r13, 40(%rdi) -; AVX2-NEXT: movq %r10, 16(%rdi) -; AVX2-NEXT: movq %rsi, 24(%rdi) -; AVX2-NEXT: movq %r14, (%rdi) -; AVX2-NEXT: movq %r15, 8(%rdi) -; AVX2-NEXT: sete %al -; AVX2-NEXT: addq $8, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: reset_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 
-; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: pushq %rax -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq -48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %r8, %rax -; AVX512-NEXT: movq -16(%rsp,%rbx), %r10 -; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi -; AVX512-NEXT: shldq %cl, %r10, %rsi -; AVX512-NEXT: movq -32(%rsp,%rbx), %r11 -; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT: movq %r15, %r9 -; AVX512-NEXT: shldq %cl, %r11, %r9 -; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx -; AVX512-NEXT: shldq %cl, %rdx, %r8 -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: shldq %cl, %r14, %r11 -; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT: shldq %cl, %rbx, %rdx -; AVX512-NEXT: shlxq %rcx, %rbx, %rcx -; AVX512-NEXT: movq 24(%rdi), %rbx -; AVX512-NEXT: movq 56(%rdi), %r14 -; AVX512-NEXT: movq 16(%rdi), %r15 -; AVX512-NEXT: movq 48(%rdi), %r13 -; AVX512-NEXT: movq 32(%rdi), %rbp -; AVX512-NEXT: andnq %rbp, %r11, %r12 -; AVX512-NEXT: andq %r11, %rbp -; AVX512-NEXT: andnq %r13, %r10, %r11 -; AVX512-NEXT: andq %r10, %r13 -; AVX512-NEXT: andnq %r15, %r8, %r10 -; AVX512-NEXT: andq %r8, %r15 -; AVX512-NEXT: movq 40(%rdi), %r8 -; AVX512-NEXT: orq %r13, %r15 -; AVX512-NEXT: andnq %r8, %r9, %r13 -; AVX512-NEXT: andq %r9, %r8 -; AVX512-NEXT: andnq %r14, %rsi, %r9 -; AVX512-NEXT: andq %rsi, %r14 -; AVX512-NEXT: andnq %rbx, %rax, %rsi -; AVX512-NEXT: andq %rax, %rbx -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: orq %r14, %rbx -; AVX512-NEXT: andnq %rax, %rcx, %r14 -; AVX512-NEXT: andq %rcx, %rax -; AVX512-NEXT: orq %rbp, %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: orq %r15, %rax -; AVX512-NEXT: andnq %rcx, %rdx, %r15 -; AVX512-NEXT: andq %rdx, %rcx -; AVX512-NEXT: orq %r8, %rcx -; AVX512-NEXT: orq %rbx, %rcx -; AVX512-NEXT: orq %rax, %rcx -; AVX512-NEXT: movq %r11, 48(%rdi) -; AVX512-NEXT: movq %r9, 56(%rdi) -; AVX512-NEXT: movq %r12, 32(%rdi) -; AVX512-NEXT: movq %r13, 40(%rdi) -; AVX512-NEXT: movq %r10, 16(%rdi) -; AVX512-NEXT: movq %rsi, 24(%rdi) -; AVX512-NEXT: movq %r14, (%rdi) -; AVX512-NEXT: movq %r15, 8(%rdi) -; AVX512-NEXT: sete %al -; AVX512-NEXT: addq $8, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %mask = xor i512 %bit, -1 - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = and i512 %ld, %mask - %cmp = icmp eq i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp +; AVX-LABEL: chain_reset_i256: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX-NEXT: movl $-2, %eax +; AVX-NEXT: roll %cl, %eax +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $28, %ecx +; AVX-NEXT: andl %eax, (%rdi,%rcx) +; AVX-NEXT: vmovdqu (%rdi), %ymm0 +; AVX-NEXT: movl (%rdi), %ecx +; AVX-NEXT: movl (%rsi), %eax +; AVX-NEXT: movl %ecx, (%rsi) +; AVX-NEXT: movl (%rdx), %ecx +; AVX-NEXT: addl %ecx, %eax +; 
AVX-NEXT: vptest %ymm0, %ymm0 +; AVX-NEXT: cmovnel %ecx, %eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %rem = and i32 %position, 255 + %ofs = zext nneg i32 %rem to i256 + %bit = shl nuw i256 1, %ofs + %ld0 = load i256, ptr %p0 + %msk = xor i256 %bit, -1 + %res = and i256 %ld0, %msk + store i256 %res, ptr %p0 + %cmp = icmp ne i256 %res, 0 + %ld1 = load i32, ptr %p1 + %trunc = trunc i256 %res to i32 + store i32 %trunc, ptr %p1 + %ld2 = load i32, ptr %p2 + %add = add i32 %ld1, %ld2 + %sel = select i1 %cmp, i32 %ld2, i32 %add + ret i32 %sel } -define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: set_ne_i512: +; BTC/BT/BTS sequence on same i128 +define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind { +; X86-LABEL: sequence_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp @@ -2803,27 +1265,9 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $272, %esp # imm = 0x110 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: subl $144, %esp +; X86-NEXT: movb 20(%ebp), %ch +; X86-NEXT: movb 12(%ebp), %cl ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -2832,225 +1276,85 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 56(%esp,%eax), %edx +; X86-NEXT: movl 60(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: shll %cl, %edi ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %esi +; X86-NEXT: movb %ch, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 84(%esp,%eax), %edx +; X86-NEXT: movl 88(%esp,%eax), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: movzbl 20(%ebp), %ecx +; X86-NEXT: shldl %cl, %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %esi +; X86-NEXT: movl 80(%esp,%eax), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 92(%esp,%eax), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl 56(%edx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl 24(%edx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 12(%eax), %ebx -; X86-NEXT: 
movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl 60(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 28(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 240(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 32(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 16(%esi), %eax +; X86-NEXT: shldl %cl, %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl %esi, %edi -; X86-NEXT: movl 52(%eax), %eax +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll %cl, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded 
Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: xorl 8(%eax), %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: xorl 12(%eax), %esi +; X86-NEXT: xorl (%eax), %edi +; X86-NEXT: xorl 4(%eax), %ebx +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: andb $96, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 96(%esp,%eax), %eax +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setae %al ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl %ebx, 60(%edx) -; X86-NEXT: movl %edi, 56(%edx) -; X86-NEXT: movl %ecx, 52(%edx) -; X86-NEXT: movl %esi, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl 
%eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 4(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) -; X86-NEXT: setne %al +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, (%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -3058,324 +1362,135 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: set_ne_i512: +; SSE-LABEL: sequence_i128: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %ecx, %eax ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq (%rsp,%rbx), %rsi -; SSE-NEXT: movq 8(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 32(%rsp,%rbx), %r8 -; SSE-NEXT: movq 40(%rsp,%rbx), %rbp -; SSE-NEXT: shldq %cl, %r8, %rbp -; SSE-NEXT: movq 16(%rsp,%rbx), %r9 -; SSE-NEXT: movq 24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r10 -; SSE-NEXT: shldq %cl, %r9, %r10 -; SSE-NEXT: movq -8(%rsp,%rbx), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -16(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: movq 24(%rdi), %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 16(%rdi), %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %r8, %r13 -; SSE-NEXT: andq %rsi, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %rcx, %r13 -; SSE-NEXT: andq %rbp, %r13 -; SSE-NEXT: andq %rax, %r15 -; SSE-NEXT: orq %r13, %r15 -; SSE-NEXT: movq 32(%rdi), 
%r14 -; SSE-NEXT: movq %r14, %rcx -; SSE-NEXT: andq %r9, %rcx -; SSE-NEXT: movq (%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rbx, %r13 -; SSE-NEXT: orq %rcx, %r13 -; SSE-NEXT: orq %r12, %r13 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r12 -; SSE-NEXT: andq %r10, %r12 -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: andq %r11, %rax -; SSE-NEXT: orq %r12, %rax -; SSE-NEXT: orq %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: orq %rcx, %r10 -; SSE-NEXT: orq %r14, %r9 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT: orq %rdx, %r11 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: movq %r8, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r9, 32(%rdi) -; SSE-NEXT: movq %r10, 40(%rdi) -; SSE-NEXT: movq %rsi, 16(%rdi) -; SSE-NEXT: movq %r15, 24(%rdi) -; SSE-NEXT: movq %rbx, (%rdi) -; SSE-NEXT: movq %r11, 8(%rdi) -; SSE-NEXT: setne %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp +; SSE-NEXT: movl $1, %r8d +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %r8, %rsi +; SSE-NEXT: movl $1, %r9d +; SSE-NEXT: shlq %cl, %r9 +; SSE-NEXT: xorl %r11d, %r11d +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %r9, %rsi +; SSE-NEXT: cmovneq %r11, %r9 +; SSE-NEXT: xorl %r10d, %r10d +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: shldq %cl, %r8, %r10 +; SSE-NEXT: shlq %cl, %r8 +; SSE-NEXT: testb $64, %al +; SSE-NEXT: cmovneq %r8, %r10 +; SSE-NEXT: cmovneq %r11, %r8 +; SSE-NEXT: xorq 8(%rdi), %rsi +; SSE-NEXT: xorq (%rdi), %r9 +; SSE-NEXT: movl %edx, %ecx +; SSE-NEXT: andb $32, %cl +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: shrdq %cl, %rsi, %rax +; SSE-NEXT: movq %rsi, %r11 +; SSE-NEXT: shrq %cl, %r11 +; SSE-NEXT: testb $64, %dl +; SSE-NEXT: cmoveq %rax, %r11 +; SSE-NEXT: btl %edx, %r11d +; SSE-NEXT: setae %al +; SSE-NEXT: orq %r10, %rsi +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: movq %r9, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) ; SSE-NEXT: retq ; -; AVX2-LABEL: set_ne_i512: +; AVX2-LABEL: sequence_i128: ; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $72, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, (%rsp) +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rbx -; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX2-NEXT: movq %rbp, %rax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT: shldq %cl, %r8, %r13 -; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT: movq 
%r14, %r10 -; AVX2-NEXT: shldq %cl, %r9, %r10 -; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT: shldq %cl, %r11, %rsi -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r8, %r14 -; AVX2-NEXT: andq %rsi, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq 56(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r13, %r15 -; AVX2-NEXT: movq 24(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %rax, %r14 -; AVX2-NEXT: orq %r15, %r14 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq (%rsp,%rbx), %rdx -; AVX2-NEXT: movq 32(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r9, %r15 -; AVX2-NEXT: shlxq %rcx, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq (%rdi), %rbx -; AVX2-NEXT: movq %rbx, %rbp -; AVX2-NEXT: andq %rax, %rbp -; AVX2-NEXT: orq %r15, %rbp -; AVX2-NEXT: orq %r12, %rbp -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: andq %r10, %rcx -; AVX2-NEXT: movq 8(%rdi), %r15 -; AVX2-NEXT: movq %r15, %r12 -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: orq %rcx, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT: orq %rax, %r10 -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT: orq %r15, %r11 -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: movq %r8, 48(%rdi) -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r9, 32(%rdi) -; AVX2-NEXT: movq %r10, 40(%rdi) -; AVX2-NEXT: movq %rsi, 16(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %rbx, (%rdi) -; AVX2-NEXT: movq %r11, 8(%rdi) -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $72, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: movl $1, %r10d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %r10, %rsi +; AVX2-NEXT: shlxq %rcx, %r10, %r8 +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %r8, %rsi +; AVX2-NEXT: cmovneq %r9, %r8 +; AVX2-NEXT: xorl %r11d, %r11d +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shldq %cl, %r10, %r11 +; AVX2-NEXT: shlxq %rax, %r10, %r10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: cmovneq %r10, %r11 +; AVX2-NEXT: cmovneq %r9, %r10 +; AVX2-NEXT: xorq 8(%rdi), %rsi +; AVX2-NEXT: xorq (%rdi), %r8 +; AVX2-NEXT: movl %edx, %ecx +; AVX2-NEXT: andb $32, %cl +; AVX2-NEXT: movq %r8, %rax +; AVX2-NEXT: shrdq %cl, %rsi, %rax +; AVX2-NEXT: shrxq %rcx, %rsi, %rcx +; AVX2-NEXT: testb $64, %dl +; AVX2-NEXT: cmoveq %rax, %rcx +; AVX2-NEXT: btl %edx, %ecx +; AVX2-NEXT: setae %al +; AVX2-NEXT: orq %r11, %rsi +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) ; AVX2-NEXT: retq ; -; 
AVX512-LABEL: set_ne_i512: +; AVX512-LABEL: sequence_i128: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $72, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, (%rsp) +; AVX512-NEXT: movl %ecx, %eax ; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT: movq %rbp, %rax -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT: shldq %cl, %r8, %r13 -; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %r10 -; AVX512-NEXT: shldq %cl, %r9, %r10 -; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT: shldq %cl, %r11, %rsi -; AVX512-NEXT: shldq %cl, %r14, %r8 -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r8, %r14 -; AVX512-NEXT: andq %rsi, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq 56(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r13, %r15 -; AVX512-NEXT: movq 24(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %rax, %r14 -; AVX512-NEXT: orq %r15, %r14 -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: movq (%rsp,%rbx), %rdx -; AVX512-NEXT: movq 32(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r9, %r15 -; AVX512-NEXT: shlxq %rcx, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq (%rdi), %rbx -; AVX512-NEXT: movq %rbx, %rbp -; AVX512-NEXT: andq %rax, %rbp -; AVX512-NEXT: orq %r15, %rbp -; AVX512-NEXT: orq %r12, %rbp -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rdx, %r11 -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andq %r10, %rcx -; AVX512-NEXT: movq 8(%rdi), %r15 -; AVX512-NEXT: movq %r15, %r12 -; AVX512-NEXT: andq %r11, %r12 -; AVX512-NEXT: orq %rcx, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: orq %rax, %r10 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT: orq %r15, %r11 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: movq %r8, 48(%rdi) -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r9, 32(%rdi) -; AVX512-NEXT: movq %r10, 40(%rdi) -; 
AVX512-NEXT: movq %rsi, 16(%rdi) -; AVX512-NEXT: movq %rcx, 24(%rdi) -; AVX512-NEXT: movq %rbx, (%rdi) -; AVX512-NEXT: movq %r11, 8(%rdi) -; AVX512-NEXT: setne %al -; AVX512-NEXT: addq $72, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: movl $1, %r9d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq %cl, %r9, %rsi +; AVX512-NEXT: xorl %r10d, %r10d +; AVX512-NEXT: shlxq %rcx, %r9, %r8 +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %r8, %rsi +; AVX512-NEXT: cmovneq %r10, %r8 +; AVX512-NEXT: xorl %r11d, %r11d +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: shldq %cl, %r9, %r11 +; AVX512-NEXT: shlxq %rax, %r9, %r9 +; AVX512-NEXT: testb $64, %al +; AVX512-NEXT: cmovneq %r9, %r11 +; AVX512-NEXT: cmovneq %r10, %r9 +; AVX512-NEXT: xorq 8(%rdi), %rsi +; AVX512-NEXT: xorq (%rdi), %r8 +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: andb $32, %cl +; AVX512-NEXT: movq %r8, %rax +; AVX512-NEXT: shrdq %cl, %rsi, %rax +; AVX512-NEXT: shrxq %rcx, %rsi, %rcx +; AVX512-NEXT: testb $64, %dl +; AVX512-NEXT: cmoveq %rax, %rcx +; AVX512-NEXT: btl %edx, %ecx +; AVX512-NEXT: setae %al +; AVX512-NEXT: orq %r11, %rsi +; AVX512-NEXT: orq %r9, %r8 +; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq %rsi, 8(%rdi) ; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = or i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp + %rem0 = and i32 %pos0, 127 + %rem1 = and i32 %pos1, 127 + %rem2 = and i32 %pos2, 127 + %ofs0 = zext nneg i32 %rem0 to i128 + %ofs1 = zext nneg i32 %rem1 to i128 + %ofs2 = zext nneg i32 %rem2 to i128 + %bit0 = shl nuw i128 1, %ofs0 + %bit1 = shl nuw i128 1, %ofs1 + %bit2 = shl nuw i128 1, %ofs2 + %ld = load i128, ptr %word + %res0 = xor i128 %ld, %bit0 + %test1 = and i128 %res0, %bit1 + %cmp1 = icmp eq i128 %test1, 0 + %res2 = or i128 %res0, %bit2 + store i128 %res2, ptr %word + ret i1 %cmp1 } -define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { -; X86-LABEL: init_eq_i512: +define i32 @blsr_u512(ptr %word) nounwind { +; X86-LABEL: blsr_u512: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp @@ -3383,126 +1498,215 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $432, %esp # imm = 0x1B0 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: leal {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl %edx, %esi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%esi), %eax +; X86-NEXT: subl $240, %esp +; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: movl 12(%ebx), %esi +; X86-NEXT: movl 28(%ebx), %eax +; X86-NEXT: movl 60(%ebx), %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%esi), %eax -; X86-NEXT: movl 48(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%esi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl 16(%ebp), %ebx -; X86-NEXT: movzbl %bl, %esi -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: leal {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl %edx, %esi +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl 44(%ebx), %edx ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl 20(%ebx), %edx +; X86-NEXT: movl 52(%ebx), %eax ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl 4(%ebx), %edi +; X86-NEXT: movl 36(%ebx), %esi ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: orl %esi, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl 24(%ebx), %edx +; X86-NEXT: movl 56(%ebx), %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl 8(%ebx), %ecx +; X86-NEXT: movl 40(%ebx), %esi +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl 16(%ebx), %edx +; X86-NEXT: movl 48(%ebx), %esi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl (%ebx), %esi +; X86-NEXT: movl 32(%ebx), %ebx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: je .LBB26_1 +; X86-NEXT: # %bb.2: # %cond.false +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: jne .LBB26_3 +; X86-NEXT: # %bb.4: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: addl $32, %eax +; X86-NEXT: jmp .LBB26_5 +; X86-NEXT: .LBB26_1: +; X86-NEXT: movl $512, %ecx # imm = 0x200 +; X86-NEXT: jmp .LBB26_41 +; X86-NEXT: .LBB26_3: +; X86-NEXT: rep bsfl %ebx, %eax +; X86-NEXT: .LBB26_5: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: jne .LBB26_6 +; X86-NEXT: # %bb.7: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl $32, %ecx +; X86-NEXT: jmp .LBB26_8 +; X86-NEXT: .LBB26_6: +; X86-NEXT: rep bsfl %ecx, %ecx +; X86-NEXT: .LBB26_8: # %cond.false ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: jne .LBB26_10 +; X86-NEXT: # %bb.9: # %cond.false +; X86-NEXT: addl $64, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB26_10: # %cond.false +; X86-NEXT: testl %esi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: jne .LBB26_11 +; X86-NEXT: # %bb.12: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl $32, %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: je .LBB26_15 +; X86-NEXT: .LBB26_14: +; X86-NEXT: rep bsfl %edx, 
%edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: je .LBB26_17 +; X86-NEXT: jmp .LBB26_18 +; X86-NEXT: .LBB26_11: +; X86-NEXT: rep bsfl %esi, %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB26_14 +; X86-NEXT: .LBB26_15: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: addl $32, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: jne .LBB26_18 +; X86-NEXT: .LBB26_17: # %cond.false +; X86-NEXT: addl $64, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: .LBB26_18: # %cond.false ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl %edx, %esi +; X86-NEXT: jne .LBB26_20 +; X86-NEXT: # %bb.19: # %cond.false +; X86-NEXT: subl $-128, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: .LBB26_20: # %cond.false +; X86-NEXT: addl $256, %eax # imm = 0x100 +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: testl %edx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: jne .LBB26_21 +; X86-NEXT: # %bb.22: # %cond.false +; X86-NEXT: rep bsfl %edi, %ebx +; X86-NEXT: addl $32, %ebx +; X86-NEXT: jmp .LBB26_23 +; X86-NEXT: .LBB26_21: +; X86-NEXT: rep bsfl %edx, %ebx +; X86-NEXT: .LBB26_23: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: jne .LBB26_24 +; X86-NEXT: # %bb.25: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl $32, %ecx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: je .LBB26_27 +; X86-NEXT: jmp .LBB26_28 +; X86-NEXT: .LBB26_24: +; X86-NEXT: rep bsfl %ecx, %ecx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: jne .LBB26_28 +; X86-NEXT: .LBB26_27: # %cond.false +; X86-NEXT: addl $64, %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: .LBB26_28: # %cond.false +; X86-NEXT: testl %esi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: jne .LBB26_29 +; X86-NEXT: # %bb.30: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl $32, %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: je .LBB26_33 +; X86-NEXT: .LBB26_32: +; X86-NEXT: rep bsfl %edx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: je .LBB26_35 +; X86-NEXT: jmp .LBB26_36 +; X86-NEXT: .LBB26_29: +; X86-NEXT: rep bsfl %esi, %ecx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB26_32 +; X86-NEXT: .LBB26_33: # %cond.false +; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: addl $32, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: jne .LBB26_36 +; X86-NEXT: .LBB26_35: # %cond.false +; X86-NEXT: addl $64, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: .LBB26_36: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edi, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl 
%edx, %esi +; X86-NEXT: jne .LBB26_38 +; X86-NEXT: # %bb.37: # %cond.false +; X86-NEXT: subl $-128, %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: .LBB26_38: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: jne .LBB26_40 +; X86-NEXT: # %bb.39: # %cond.false +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: .LBB26_40: # %cond.false +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: .LBB26_41: # %cond.end ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: shldl %cl, %edi, %edx +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %esi, %edx ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -3518,6 +1722,7 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -3534,1948 +1739,133 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edx -; 
X86-NEXT: movl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%ebx), %esi +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $31, %ecx +; X86-NEXT: movl 56(%edx), %edi +; X86-NEXT: movl 60(%edx), %esi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%ebx), %esi +; X86-NEXT: movl 52(%edx), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: notl %edi +; X86-NEXT: andl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebx), %esi +; X86-NEXT: movl 40(%edx), %eax +; X86-NEXT: movl 44(%edx), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebx), %eax +; X86-NEXT: movl 36(%edx), %esi +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%ebx), %esi +; X86-NEXT: movl 32(%edx), %eax +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%ebx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl 28(%edx), %esi +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 24(%edx), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl 56(%edi), %ebx -; X86-NEXT: movl 60(%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 52(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 48(%edi), %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl 4(%edx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%edx), %eax +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 40(%edi), %ebx -; X86-NEXT: movl 44(%edi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 
36(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 32(%edi), %ebx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 28(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 24(%edi), %ebx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 20(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 16(%edi), %ebx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %edx ; X86-NEXT: notl %edx ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 12(%edi), %eax -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl 12(%ebx), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl %cl, %eax, %esi ; X86-NEXT: notl %esi ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 8(%edi), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: orl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%ebx), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%ebx), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: notl %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax ; X86-NEXT: notl %eax ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%edi), %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edx -; X86-NEXT: orl %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl (%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: negl %eax +; X86-NEXT: movl 208(%esp,%eax), %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 60(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 56(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 52(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 44(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 40(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 36(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 32(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 24(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 20(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 16(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %esi, 48(%eax) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: init_eq_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $216, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; 
SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %r10 -; SSE-NEXT: movq 184(%rsp,%r10), %r11 -; SSE-NEXT: movq 192(%rsp,%r10), %rsi -; SSE-NEXT: movq %rsi, %r13 -; SSE-NEXT: shldq %cl, %r11, %r13 -; SSE-NEXT: movq 200(%rsp,%r10), %r15 -; SSE-NEXT: shldq %cl, %rsi, %r15 -; SSE-NEXT: movq 168(%rsp,%r10), %rbx -; SSE-NEXT: movq 176(%rsp,%r10), %rsi -; SSE-NEXT: movq %rsi, %r14 -; SSE-NEXT: shldq %cl, %rbx, %r14 -; SSE-NEXT: shldq %cl, %rsi, %r11 -; SSE-NEXT: movq 152(%rsp,%r10), %rax -; SSE-NEXT: movq 160(%rsp,%r10), %r8 -; SSE-NEXT: movq %r8, %r12 -; SSE-NEXT: shldq %cl, %rax, %r12 -; SSE-NEXT: shldq %cl, %r8, %rbx -; SSE-NEXT: movq 144(%rsp,%r10), %r9 -; SSE-NEXT: movq %r9, %r8 -; SSE-NEXT: shlq %cl, %r8 -; SSE-NEXT: shldq %cl, %r9, %rax -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movl %edx, %edx -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, (%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq 16(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rsi, %r13 -; SSE-NEXT: andq %rdx, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %r15, %rsi -; SSE-NEXT: movq 56(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %r15 -; SSE-NEXT: movq %rbx, %r13 -; SSE-NEXT: movq 24(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %rbx -; SSE-NEXT: orq %r15, %rbx -; SSE-NEXT: movq %r14, %rbp -; SSE-NEXT: movq 32(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %r14 -; SSE-NEXT: movq %r8, %r15 -; SSE-NEXT: movq (%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %r8 -; SSE-NEXT: orq %r14, %r8 -; SSE-NEXT: orq %r12, %r8 -; SSE-NEXT: movq %r11, %r12 -; SSE-NEXT: movq 40(%rdi), %r9 -; SSE-NEXT: andq %r9, %r11 -; SSE-NEXT: movq %rax, %r14 -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %rax -; SSE-NEXT: orq %r11, %rax -; SSE-NEXT: orq %rbx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: notq %rax -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: movq 56(%rsp,%r10), %r11 -; SSE-NEXT: movq 64(%rsp,%r10), %rax -; SSE-NEXT: movq %rax, %rbx -; SSE-NEXT: shldq %cl, %r11, %rbx -; SSE-NEXT: orq %rbx, %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: notq %rsi -; SSE-NEXT: movq 72(%rsp,%r10), %rbx -; SSE-NEXT: shldq %cl, %rax, %rbx -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: orq %rbx, %rsi -; SSE-NEXT: notq %rbp -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: movq 40(%rsp,%r10), %rax -; SSE-NEXT: movq 48(%rsp,%r10), %rdx -; SSE-NEXT: movq %rdx, %rbx -; SSE-NEXT: shldq %cl, %rax, %rbx -; SSE-NEXT: orq %rbx, %rbp -; SSE-NEXT: notq %r12 -; SSE-NEXT: andq %r9, %r12 
-; SSE-NEXT: shldq %cl, %rdx, %r11 -; SSE-NEXT: movq 24(%rsp,%r10), %r9 -; SSE-NEXT: movq 32(%rsp,%r10), %rdx -; SSE-NEXT: movq %rdx, %rbx -; SSE-NEXT: shldq %cl, %r9, %rbx -; SSE-NEXT: orq %r11, %r12 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: orq %rbx, %r11 -; SSE-NEXT: notq %r13 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT: orq %rax, %r13 -; SSE-NEXT: notq %r15 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: movq 16(%rsp,%r10), %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: orq %rdx, %r15 -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: orq %r9, %r14 -; SSE-NEXT: orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq %rax, 48(%rdi) -; SSE-NEXT: movq %rsi, 56(%rdi) -; SSE-NEXT: movq %rbp, 32(%rdi) -; SSE-NEXT: movq %r12, 40(%rdi) -; SSE-NEXT: movq %r11, 16(%rdi) -; SSE-NEXT: movq %r13, 24(%rdi) -; SSE-NEXT: movq %r15, (%rdi) -; SSE-NEXT: movq %r14, 8(%rdi) -; SSE-NEXT: sete %al -; SSE-NEXT: addq $216, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: init_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $200, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %r8d -; AVX2-NEXT: andl $63, %r8d -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rsi -; AVX2-NEXT: movq 144(%rsp,%rsi), %r11 -; AVX2-NEXT: movq 152(%rsp,%rsi), %r12 -; AVX2-NEXT: movq %r12, %r10 -; AVX2-NEXT: movl %r8d, %ecx -; AVX2-NEXT: shldq %cl, %r11, %r10 -; AVX2-NEXT: movq 176(%rsp,%rsi), %r14 -; AVX2-NEXT: movq 184(%rsp,%rsi), %r9 -; AVX2-NEXT: shldq %cl, %r14, %r9 -; AVX2-NEXT: movq 160(%rsp,%rsi), %r15 -; AVX2-NEXT: movq 168(%rsp,%rsi), %r13 -; AVX2-NEXT: movq %r13, %rbx -; AVX2-NEXT: shldq %cl, %r15, %rbx -; AVX2-NEXT: movq 128(%rsp,%rsi), %rbp -; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 136(%rsp,%rsi), %rax -; AVX2-NEXT: shldq %cl, %rax, %r11 -; AVX2-NEXT: shldq %cl, %r13, %r14 -; AVX2-NEXT: shldq %cl, %r12, %r15 -; AVX2-NEXT: shldq %cl, %rbp, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rdx, (%rsp) -; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: movq 48(%rdi), %rbp -; AVX2-NEXT: movq 32(%rdi), %r13 -; AVX2-NEXT: andnq %r13, %r15, %rax -; AVX2-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r15, %r13 -; AVX2-NEXT: andnq %rbp, %r14, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r14, %rbp -; AVX2-NEXT: andnq %r12, %r11, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: andnq %rax, %rbx, %rcx -; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq %rax, %rbp -; AVX2-NEXT: andq %rbx, %rbp -; AVX2-NEXT: movq 56(%rdi), %rcx -; AVX2-NEXT: andnq %rcx, %r9, %rbx -; AVX2-NEXT: andq %r9, %rcx -; AVX2-NEXT: movq 24(%rdi), %rax -; AVX2-NEXT: andnq %rax, %r10, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r10, %rax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: movq (%rdi), %r10 -; AVX2-NEXT: andnq %r10, %rcx, %r15 -; AVX2-NEXT: andq %rcx, %r10 -; AVX2-NEXT: movq 40(%rsp,%rsi), %rdx -; AVX2-NEXT: movq 48(%rsp,%rsi), %r11 -; AVX2-NEXT: movq %r11, %r9 -; AVX2-NEXT: movl %r8d, %ecx -; AVX2-NEXT: shldq %cl, %rdx, %r9 -; AVX2-NEXT: orq %r13, %r10 -; AVX2-NEXT: orq %r12, %r10 -; AVX2-NEXT: movq 8(%rdi), %r13 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andnq %r13, %rcx, %r12 -; AVX2-NEXT: andq %rcx, %r13 -; AVX2-NEXT: orq %rbp, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq 56(%rsp,%rsi), %rax -; AVX2-NEXT: movl %r8d, %ecx -; AVX2-NEXT: shldq %cl, %r11, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: orq %r9, %r14 -; AVX2-NEXT: orq %rax, %rbx -; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 24(%rsp,%rsi), %rax -; AVX2-NEXT: movq 32(%rsp,%rsi), %r9 -; AVX2-NEXT: movq %r9, %r11 -; AVX2-NEXT: shldq %cl, %rax, %r11 -; AVX2-NEXT: shldq %cl, %r9, %rdx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX2-NEXT: orq %r11, %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: orq %rdx, %rbx -; AVX2-NEXT: movq 8(%rsp,%rsi), %rdx -; AVX2-NEXT: movq 16(%rsp,%rsi), %r9 -; AVX2-NEXT: movq %r9, %r11 -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: shldq %cl, %r9, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: orq %r11, %r9 -; AVX2-NEXT: movq (%rsp,%rsi), %rsi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: orq %rax, %r11 -; AVX2-NEXT: shlxq %r8, %rsi, %rax -; AVX2-NEXT: shldq %cl, %rsi, %rdx -; AVX2-NEXT: orq %rax, %r15 -; AVX2-NEXT: orq %rdx, %r12 -; AVX2-NEXT: orq %r10, %r13 -; AVX2-NEXT: movq %r14, 48(%rdi) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: movq %rax, 56(%rdi) -; AVX2-NEXT: movq %rbp, 32(%rdi) -; AVX2-NEXT: movq %rbx, 40(%rdi) -; AVX2-NEXT: movq %r9, 16(%rdi) -; AVX2-NEXT: movq %r11, 24(%rdi) -; AVX2-NEXT: movq %r15, (%rdi) -; AVX2-NEXT: movq %r12, 8(%rdi) -; AVX2-NEXT: sete %al -; AVX2-NEXT: addq $200, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $184, %rsp -; AVX512-NEXT: vxorps 
%xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rsi -; AVX512-NEXT: movq 128(%rsp,%rsi), %r10 -; AVX512-NEXT: movq 136(%rsp,%rsi), %r12 -; AVX512-NEXT: movq %r12, %rax -; AVX512-NEXT: shldq %cl, %r10, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 160(%rsp,%rsi), %r14 -; AVX512-NEXT: movq 168(%rsp,%rsi), %rax -; AVX512-NEXT: shldq %cl, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 144(%rsp,%rsi), %r15 -; AVX512-NEXT: movq 152(%rsp,%rsi), %r11 -; AVX512-NEXT: movq %r11, %rbx -; AVX512-NEXT: shldq %cl, %r15, %rbx -; AVX512-NEXT: movq 120(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rax, %r10 -; AVX512-NEXT: shldq %cl, %r11, %r14 -; AVX512-NEXT: movq %rdi, %r9 -; AVX512-NEXT: movq 112(%rsp,%rsi), %r11 -; AVX512-NEXT: shldq %cl, %r12, %r15 -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq 48(%rdi), %r13 -; AVX512-NEXT: movq 32(%rdi), %rbp -; AVX512-NEXT: andnq %rbp, %r15, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r15, %rbp -; AVX512-NEXT: andnq %r13, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r14, %r13 -; AVX512-NEXT: andnq %r12, %r10, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r10, %r12 -; AVX512-NEXT: movq 40(%rdi), %r8 -; AVX512-NEXT: orq %r13, %r12 -; AVX512-NEXT: andnq %r8, %rbx, %rdi -; AVX512-NEXT: andq %rbx, %r8 -; AVX512-NEXT: movq 56(%r9), %r13 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: andnq %r13, %rdx, %r10 -; AVX512-NEXT: andq %rdx, %r13 -; AVX512-NEXT: movq 24(%r9), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: andnq %rax, %rdx, %r15 -; AVX512-NEXT: andq %rdx, %rax -; AVX512-NEXT: orq %r13, %rax -; AVX512-NEXT: shlxq %rcx, %r11, %r13 -; AVX512-NEXT: movq (%r9), %rdx -; AVX512-NEXT: andnq %rdx, %r13, %r14 -; AVX512-NEXT: andq %r13, %rdx -; AVX512-NEXT: orq %rbp, %rdx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r11, %rbp -; AVX512-NEXT: orq %r12, %rdx -; AVX512-NEXT: movq 8(%r9), %r13 -; AVX512-NEXT: andnq %r13, %rbp, %rbx -; AVX512-NEXT: andq %rbp, %r13 -; AVX512-NEXT: orq %r8, %r13 -; AVX512-NEXT: movq 24(%rsp,%rsi), %r8 -; AVX512-NEXT: orq %rax, %r13 -; AVX512-NEXT: movq 32(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, %r12 -; AVX512-NEXT: shldq %cl, %r8, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: orq %r12, %r11 -; AVX512-NEXT: movq 40(%rsp,%rsi), %r12 -; AVX512-NEXT: shldq %cl, %rax, %r12 -; AVX512-NEXT: orq %r12, %r10 -; AVX512-NEXT: 
movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 8(%rsp,%rsi), %rax -; AVX512-NEXT: movq 16(%rsp,%rsi), %r12 -; AVX512-NEXT: movq %r12, %rbp -; AVX512-NEXT: shldq %cl, %rax, %rbp -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: orq %rbp, %r10 -; AVX512-NEXT: shldq %cl, %r12, %r8 -; AVX512-NEXT: orq %r8, %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq -8(%rsp,%rsi), %r8 -; AVX512-NEXT: movq (%rsp,%rsi), %r12 -; AVX512-NEXT: movq %r12, %rbp -; AVX512-NEXT: shldq %cl, %r8, %rbp -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: orq %rbp, %rdi -; AVX512-NEXT: movq -16(%rsp,%rsi), %rsi -; AVX512-NEXT: shldq %cl, %r12, %rax -; AVX512-NEXT: orq %rax, %r15 -; AVX512-NEXT: shlxq %rcx, %rsi, %rax -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rsi, %r8 -; AVX512-NEXT: orq %rax, %r14 -; AVX512-NEXT: orq %r8, %rbx -; AVX512-NEXT: orq %rdx, %r13 -; AVX512-NEXT: movq %r11, 48(%r9) -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: movq %rax, 56(%r9) -; AVX512-NEXT: movq %r10, 32(%r9) -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: movq %rax, 40(%r9) -; AVX512-NEXT: movq %rdi, 16(%r9) -; AVX512-NEXT: movq %r15, 24(%r9) -; AVX512-NEXT: movq %r14, (%r9) -; AVX512-NEXT: movq %rbx, 8(%r9) -; AVX512-NEXT: sete %al -; AVX512-NEXT: addq $184, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %mask = xor i512 %bit, -1 - %val0 = zext i1 %value to i512 - %val = shl nuw i512 %val0, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res0 = and i512 %ld, %mask - %res = or i512 %res0, %val - %cmp = icmp eq i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp -} - -; i4096 - -define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { -; X86-LABEL: test_ne_i4096: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $1792, %esp # imm = 0x700 -; X86-NEXT: movl 12(%ebp), %ebx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: shrl $3, %ecx -; X86-NEXT: andl $508, %ecx # imm = 0x1FC -; X86-NEXT: leal {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl %ecx, %esi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 248(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 252(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ebx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 504(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 508(%esi), %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 120(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 124(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 376(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 380(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 184(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 188(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 440(%esi), %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 444(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 312(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 316(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 216(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 220(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 472(%esi), %edi -; X86-NEXT: movl 476(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 88(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 92(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 344(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 348(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 152(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 156(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 408(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 412(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 280(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 284(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 232(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 236(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 488(%esi), %edx -; X86-NEXT: 
movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 492(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 104(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 108(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 360(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 364(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 168(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 172(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 424(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 428(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 296(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 300(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 200(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 204(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 456(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 460(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 76(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 328(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 332(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 136(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 140(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 392(%esi), %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 396(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 264(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 268(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 240(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 244(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 496(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 500(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 112(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 116(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 368(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 372(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 176(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 180(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 432(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 436(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 304(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 308(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 208(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 212(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 464(%esi), %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 468(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 84(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 336(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 340(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 144(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 148(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 400(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 404(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 272(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 276(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 224(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 228(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 480(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 484(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 100(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 352(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 356(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 160(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 164(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 416(%esi), %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 420(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 288(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 292(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 192(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 196(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 448(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 452(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 64(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 320(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 324(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 128(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 132(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %edx -; X86-NEXT: movl 256(%esi), %edi -; X86-NEXT: movl 260(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded 
Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 388(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 4(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shrdl $1, %eax, %edi -; X86-NEXT: shrl %eax -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: notb %cl -; X86-NEXT: shrdl %cl, %eax, %edi -; X86-NEXT: shrl %cl, %ebx -; X86-NEXT: movb $32, %cl -; X86-NEXT: testb %cl, %cl -; X86-NEXT: movl (%esi), %eax +; X86-NEXT: notl %edx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl (%ebx), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: jne .LBB20_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: .LBB20_2: -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: notl %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shll %cl, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 320(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 64(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 448(%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 192(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 288(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 32(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 416(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 160(%eax), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 352(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 96(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 480(%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 224(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: orl %edi, %ecx -; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 272(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 16(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 400(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 144(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 336(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 80(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 464(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 208(%eax), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 304(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 48(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 432(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 176(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 368(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 112(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 496(%eax), %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: andl 240(%eax), %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 264(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 8(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 392(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 136(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 328(%ebx), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 72(%ebx), %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 456(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 200(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 296(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 40(%ebx), %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 424(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 168(%ebx), %edx -; X86-NEXT: 
orl %edi, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 360(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 104(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 488(%ebx), %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 232(%ebx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 280(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 24(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 408(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 152(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 344(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 88(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 472(%ebx), %eax +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, 24(%ecx) +; X86-NEXT: movl %esi, 20(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 216(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 312(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 56(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 440(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 184(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 376(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 120(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 504(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 248(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 324(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 68(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 452(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 196(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; 
X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 292(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 36(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 420(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 164(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 356(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 100(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 484(%ebx), %eax +; X86-NEXT: movl %esi, 16(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 228(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 276(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 20(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 404(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 148(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 340(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 84(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 468(%ebx), %eax +; X86-NEXT: movl %esi, 12(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 212(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %esi, 8(%ecx) +; X86-NEXT: movl %edi, 4(%ecx) +; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 308(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 52(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %eax, 28(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 436(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 180(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %eax, 32(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 372(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 116(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %eax, 36(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 500(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 244(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, 40(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte 
Reload -; X86-NEXT: andl 268(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 12(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %eax, 44(%ecx) +; X86-NEXT: movl %edx, 48(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 396(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 140(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %eax, 52(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 332(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 76(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %eax, 56(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 460(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 204(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 300(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 44(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %eax, 60(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 428(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 172(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 364(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 108(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 492(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 236(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 284(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 28(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 412(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 156(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 348(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 92(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 476(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 220(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 316(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 60(%ebx), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 444(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; 
X86-NEXT: andl 188(%ebx), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 380(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 124(%ebx), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 508(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: andl 252(%esi), %ebx
-; X86-NEXT: orl %ecx, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: orl %eax, %ebx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: negl %ecx
-; X86-NEXT: movl 1648(%esp,%ecx), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: andl 128(%edx), %ecx
-; X86-NEXT: andl 384(%edx), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: andl (%edx), %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 256(%edx), %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 260(%edx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 4(%edx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 132(%edx), %eax
-; X86-NEXT: andl 388(%edx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: setne %al
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -5483,1545 +1873,157 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: test_ne_i4096:
+; SSE-LABEL: blsr_u512:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
; SSE-NEXT: pushq %r15
; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $1576, %rsp # imm = 0x628
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl %esi, %eax
-; SSE-NEXT: andl $4032, %eax # imm = 0xFC0
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %eax
-; SSE-NEXT: negl %eax
-; SSE-NEXT: movslq %eax, %rsi
-; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1304(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1560(%rsp,%rsi), %rax
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1176(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax,
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1432(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1240(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1496(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1112(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; SSE-NEXT: movq 1368(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1272(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1528(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1144(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1400(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1208(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1464(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx -; SSE-NEXT: 
movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1080(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1336(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1288(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1544(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1160(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1416(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1216(%rsp,%rsi), %r11 -; SSE-NEXT: movq 1224(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %r11, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1480(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1088(%rsp,%rsi), %r9 -; SSE-NEXT: movq 1096(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %r9, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1352(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1248(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1512(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1120(%rsp,%rsi), %rax -; SSE-NEXT: 
movq 1128(%rsp,%rsi), %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rax, %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1376(%rsp,%rsi), %r13 -; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx -; SSE-NEXT: movq %rbx, %r8 -; SSE-NEXT: shldq %cl, %r13, %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1192(%rsp,%rsi), %r15 -; SSE-NEXT: movq %r15, %r14 -; SSE-NEXT: shldq %cl, %rdx, %r14 -; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1440(%rsp,%rsi), %r10 -; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, %r14 -; SSE-NEXT: shldq %cl, %r10, %r14 -; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1312(%rsp,%rsi), %r14 -; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp -; SSE-NEXT: movq %rbp, %r12 -; SSE-NEXT: shldq %cl, %r14, %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx -; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE-NEXT: shldq %cl, %rbp, %r14 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: 
movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r9 -; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %rbp -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r13 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r12, %r15 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r12, %r10 -; SSE-NEXT: andq 384(%rdi), %r10 -; SSE-NEXT: andq 128(%rdi), %r15 -; SSE-NEXT: andq 320(%rdi), %r13 -; SSE-NEXT: andq 64(%rdi), %rax -; SSE-NEXT: orq %r10, %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: andq 448(%rdi), %r9 -; SSE-NEXT: andq 192(%rdi), %rbp -; SSE-NEXT: orq %r9, %rbp -; SSE-NEXT: orq %rax, %rbp -; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq 288(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 32(%rdi), %r9 -; SSE-NEXT: andq 416(%rdi), %rdx -; SSE-NEXT: andq 160(%rdi), %r11 -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: orq %rdx, %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 352(%rdi), %rdx -; SSE-NEXT: orq %r9, %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 96(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 480(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 224(%rdi), %r8 -; SSE-NEXT: orq %rax, %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq 272(%rdi), %r14 -; SSE-NEXT: orq %r11, %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 16(%rdi), %rax -; SSE-NEXT: orq %r14, %rax -; SSE-NEXT: movq %rax, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 400(%rdi), %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 144(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: movq %rax, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 336(%rdi), %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 80(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload 
-; SSE-NEXT: andq 464(%rdi), %rdx -; SSE-NEXT: orq %r9, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: andq 208(%rdi), %r11 -; SSE-NEXT: orq %rdx, %r11 -; SSE-NEXT: orq %rax, %r11 -; SSE-NEXT: orq %r8, %r11 -; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload -; SSE-NEXT: andq 304(%rdi), %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 48(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 432(%rdi), %r9 -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 176(%rdi), %r8 -; SSE-NEXT: orq %r9, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 368(%rdi), %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 112(%rdi), %rax -; SSE-NEXT: orq %r10, %r8 -; SSE-NEXT: movq %r8, %r10 -; SSE-NEXT: orq %r9, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 496(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE-NEXT: andq 240(%rdi), %rbp -; SSE-NEXT: orq %r8, %rbp -; SSE-NEXT: orq %rax, %rbp -; SSE-NEXT: orq %r10, %rbp -; SSE-NEXT: orq %r11, %rbp -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 392(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT: andq 136(%rdi), %r12 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 328(%rdi), %rdx -; SSE-NEXT: orq %rax, %r12 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 72(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 456(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; SSE-NEXT: andq 200(%rdi), %r13 -; SSE-NEXT: orq %rax, %r13 -; SSE-NEXT: orq %rdx, %r13 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 296(%rdi), %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 40(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 424(%rdi), %r8 -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 168(%rdi), %rdx -; SSE-NEXT: orq %r8, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 360(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 104(%rdi), %rax -; SSE-NEXT: orq %r9, %rdx -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: movq %rax, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 488(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: andq 232(%rdi), %r15 -; SSE-NEXT: orq %rax, %r15 -; SSE-NEXT: orq %r8, %r15 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 280(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 24(%rdi), %rax -; SSE-NEXT: orq %rdx, %r15 -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 408(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 152(%rdi), %rax -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: orq %r10, %rax 
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 344(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 88(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 472(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE-NEXT: andq 216(%rdi), %r14
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: orq %rax, %r14
-; SSE-NEXT: orq %r8, %r14
-; SSE-NEXT: orq %r10, %r14
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 312(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE-NEXT: andq 56(%rdi), %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 440(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 184(%rdi), %r9
-; SSE-NEXT: orq %r11, %r10
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: orq %r10, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE-NEXT: andq 376(%rdi), %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 120(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 504(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 248(%rdi), %r8
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: movq 1056(%rsp,%rsi), %rax
-; SSE-NEXT: shldq %cl, %rax, %rbx
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: orq %r9, %r8
-; SSE-NEXT: andq 256(%rdi), %rdx
-; SSE-NEXT: orq %r14, %r8
-; SSE-NEXT: andq (%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq %rbp, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT: andq 264(%rdi), %rcx
-; SSE-NEXT: andq 8(%rdi), %rbx
-; SSE-NEXT: orq %rcx, %rbx
-; SSE-NEXT: orq %r12, %rbx
-; SSE-NEXT: orq %r13, %rbx
-; SSE-NEXT: orq %r15, %rbx
-; SSE-NEXT: orq %r8, %rbx
-; SSE-NEXT: orq %rax, %rbx
-; SSE-NEXT: setne %al
-; SSE-NEXT: addq $1576, %rsp # imm = 0x628
+; SSE-NEXT: movq 48(%rdi), %r11
+; SSE-NEXT: movq 40(%rdi), %r9
+; SSE-NEXT: movq 24(%rdi), %r8
+; SSE-NEXT: movq 16(%rdi), %rdx
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %rsi
+; SSE-NEXT: rep bsfq %rcx, %rax
+; SSE-NEXT: rep bsfq %rsi, %rbx
+; SSE-NEXT: addq $64, %rbx
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovneq %rax, %rbx
+; SSE-NEXT: rep bsfq %rdx, %rax
+; SSE-NEXT: rep bsfq %r8, %r10
+; SSE-NEXT: addq $64, %r10
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovneq %rax, %r10
+; SSE-NEXT: movq 32(%rdi), %r14
+; SSE-NEXT: subq $-128, %r10
+; SSE-NEXT: movq %rcx, %rax
+; SSE-NEXT: orq %rsi, %rax
+; SSE-NEXT: cmovneq %rbx, %r10
+; SSE-NEXT: rep bsfq %r14, %rax
+; SSE-NEXT: rep bsfq %r9, %rbx
+; SSE-NEXT: addq $64, %rbx
+; SSE-NEXT: testq %r14, %r14
+; SSE-NEXT: cmovneq %rax, %rbx
+; SSE-NEXT: rep bsfq %r11, %r15
+; SSE-NEXT: movl $64, %eax
+; SSE-NEXT: rep bsfq 56(%rdi), %rax
+; SSE-NEXT: addq $64, %rax
+; SSE-NEXT: testq %r11, %r11
+; SSE-NEXT: cmovneq %r15, %rax
+; SSE-NEXT: subq $-128, %rax
+; SSE-NEXT: orq %r9, %r14
+; SSE-NEXT: cmovneq %rbx, %rax
+; SSE-NEXT: addq $256, %rax # imm = 0x100
+; SSE-NEXT: orq %r8, %rsi
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: orq %rsi, %rcx
+; SSE-NEXT: cmovneq %r10, %rax
+; SSE-NEXT: movl $-2, %edx
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: roll %cl, %edx
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: shrl $3, %ecx
+; SSE-NEXT: andl $60, %ecx
+; SSE-NEXT: andl %edx, (%rdi,%rcx)
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
-; AVX2-LABEL: test_ne_i4096:
+; AVX2-LABEL: blsr_u512:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $1560, %rsp # imm = 0x618
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl %esi, %eax
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: movslq %eax, %rsi
-; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT:
movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11 -; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %r11, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12 -; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %r12, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp -; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rbp, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax -; AVX2-NEXT: movq 
1112(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10 -; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8 -; AVX2-NEXT: movq %r8, %rdx -; AVX2-NEXT: shldq %cl, %r10, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9 -; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx -; AVX2-NEXT: movq %rbx, %rdx -; AVX2-NEXT: shldq %cl, %r9, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9 -; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, %r14 -; AVX2-NEXT: shldq %cl, %r9, %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14 -; AVX2-NEXT: movq %r14, %r13 -; AVX2-NEXT: shldq %cl, %r15, %r13 -; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx -; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: shldq %cl, %rbx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r13 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, %r9 -; AVX2-NEXT: andq 384(%rdi), %r9 -; AVX2-NEXT: andq 128(%rdi), %r14 -; AVX2-NEXT: andq 320(%rdi), %r10 -; AVX2-NEXT: orq %r9, %r14 -; AVX2-NEXT: movq %r14, %r15 -; AVX2-NEXT: andq 64(%rdi), %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: andq 448(%rdi), %rbp -; AVX2-NEXT: andq 192(%rdi), %r13 -; AVX2-NEXT: orq %rbp, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq 288(%rdi), %r8 -; AVX2-NEXT: andq 32(%rdi), %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 416(%rdi), %rax -; AVX2-NEXT: orq %r8, %r12 -; AVX2-NEXT: andq 160(%rdi), %r11 -; AVX2-NEXT: orq %rax, %r11 -; AVX2-NEXT: andq 352(%rdi), %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 96(%rdi), %rax -; AVX2-NEXT: orq %r12, %r11 -; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 480(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: andq 224(%rdi), %r13 -; AVX2-NEXT: orq %r10, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 272(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 16(%rdi), %rax -; AVX2-NEXT: orq %r11, %r13 -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: andq 400(%rdi), %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 144(%rdi), %rax -; AVX2-NEXT: orq %r9, %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 336(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 80(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 464(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: 
andq 208(%rdi), %r11 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: orq %r8, %r11 -; AVX2-NEXT: orq %rax, %r11 -; AVX2-NEXT: orq %r9, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: andq 304(%rdi), %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 48(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 432(%rdi), %r10 -; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload -; AVX2-NEXT: andq 176(%rdi), %rax -; AVX2-NEXT: orq %r9, %r8 -; AVX2-NEXT: movq %r8, %r9 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 368(%rdi), %r8 -; AVX2-NEXT: orq %r9, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 112(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 496(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: andq 240(%rdi), %r9 -; AVX2-NEXT: orq %r8, %r9 -; AVX2-NEXT: orq %rax, %r9 -; AVX2-NEXT: orq %r10, %r9 -; AVX2-NEXT: orq %r11, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 392(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX2-NEXT: andq 136(%rdi), %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 328(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 72(%rdi), %rax -; AVX2-NEXT: orq %r10, %rbp -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 456(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX2-NEXT: andq 200(%rdi), %r12 -; AVX2-NEXT: orq %rax, %r12 -; AVX2-NEXT: orq %r8, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 296(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 40(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: andq 424(%rdi), %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 168(%rdi), %rax -; AVX2-NEXT: orq %r10, %r8 -; AVX2-NEXT: movq %r8, %r10 -; AVX2-NEXT: orq %r11, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 360(%rdi), %r8 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 104(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 488(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: andq 232(%rdi), %r14 -; AVX2-NEXT: orq %rax, %r14 -; AVX2-NEXT: orq %r8, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 280(%rdi), %r8 -; AVX2-NEXT: orq %r10, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 24(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 408(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 152(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: orq %r10, %rax -; 
AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: andq 344(%rdi), %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 88(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 472(%rdi), %rax -; AVX2-NEXT: orq %r11, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: andq 216(%rdi), %rbx -; AVX2-NEXT: orq %rax, %rbx -; AVX2-NEXT: orq %r8, %rbx -; AVX2-NEXT: orq %r10, %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 312(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 56(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 440(%rdi), %r10 -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 184(%rdi), %r8 -; AVX2-NEXT: orq %r10, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 376(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 120(%rdi), %rax -; AVX2-NEXT: orq %r11, %r8 -; AVX2-NEXT: movq %r8, %r11 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 504(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 248(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r8, %r10 -; AVX2-NEXT: orq %r11, %rax -; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi -; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: shlxq %rcx, %rsi, %rax -; AVX2-NEXT: andq 256(%rdi), %r10 -; AVX2-NEXT: andq (%rdi), %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: orq %r15, %rax -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX2-NEXT: orq %r13, %rax -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rsi, %rdx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andq 264(%rdi), %rcx -; AVX2-NEXT: andq 8(%rdi), %rdx -; AVX2-NEXT: orq %r9, %rax -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: orq %rbp, %rdx -; AVX2-NEXT: orq %r12, %rdx -; AVX2-NEXT: orq %r14, %rdx -; AVX2-NEXT: orq %r8, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $1560, %rsp # imm = 0x618 +; AVX2-NEXT: movq 40(%rdi), %r9 +; AVX2-NEXT: movq 32(%rdi), %r10 +; AVX2-NEXT: movq 24(%rdi), %r8 +; AVX2-NEXT: movq 16(%rdi), %rdx +; AVX2-NEXT: movq (%rdi), %rcx +; AVX2-NEXT: movq 8(%rdi), %rsi +; AVX2-NEXT: tzcntq %rcx, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: tzcntq %rsi, %rbx +; AVX2-NEXT: addq $64, %rbx +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovneq %rax, %rbx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %rdx, %rax +; AVX2-NEXT: tzcntq %r8, %r11 +; AVX2-NEXT: addq $64, %r11 +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovneq %rax, %r11 +; AVX2-NEXT: subq $-128, %r11 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: orq %rsi, %rax +; AVX2-NEXT: cmovneq %rbx, %r11 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq %r10, %rax +; AVX2-NEXT: xorl %ebx, %ebx +; AVX2-NEXT: tzcntq %r9, %rbx +; AVX2-NEXT: addq $64, %rbx +; AVX2-NEXT: testq %r10, %r10 +; AVX2-NEXT: cmovneq 
%rax, %rbx +; AVX2-NEXT: movq 48(%rdi), %r14 +; AVX2-NEXT: xorl %r15d, %r15d +; AVX2-NEXT: tzcntq %r14, %r15 +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: tzcntq 56(%rdi), %rax +; AVX2-NEXT: addq $64, %rax +; AVX2-NEXT: testq %r14, %r14 +; AVX2-NEXT: cmovneq %r15, %rax +; AVX2-NEXT: subq $-128, %rax +; AVX2-NEXT: orq %r9, %r10 +; AVX2-NEXT: cmovneq %rbx, %rax +; AVX2-NEXT: addq $256, %rax # imm = 0x100 +; AVX2-NEXT: orq %r8, %rsi +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: cmovneq %r11, %rax +; AVX2-NEXT: movl $-2, %edx +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: roll %cl, %edx +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $3, %ecx +; AVX2-NEXT: andl $60, %ecx +; AVX2-NEXT: andl %edx, (%rdi,%rcx) +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax ; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 ; AVX2-NEXT: popq %r14 ; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_ne_i4096: +; AVX512-LABEL: blsr_u512: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $1560, %rsp # imm = 0x618 -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0 -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %eax -; AVX512-NEXT: negl %eax -; AVX512-NEXT: movslq %eax, %rsi -; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte 
Spill -; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1160(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax -; AVX512-NEXT: 
movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10 -; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %r10, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14 -; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq 
%rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12 -; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %r12, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax -; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11 -; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx -; AVX512-NEXT: movq %rbx, %rdx -; AVX512-NEXT: shldq %cl, %r11, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9 -; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8 -; AVX512-NEXT: movq %r8, %rdx -; AVX512-NEXT: shldq %cl, %r9, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9 -; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, %r15 -; AVX512-NEXT: shldq %cl, %r9, %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp -; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15 -; AVX512-NEXT: movq %r15, %r13 -; AVX512-NEXT: shldq %cl, %rbp, %r13 -; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %rbx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx -; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %rbx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r13 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbp, %r15 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: andq 384(%rdi), %r9 -; AVX512-NEXT: andq 128(%rdi), %r15 -; AVX512-NEXT: orq %r9, %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq 320(%rdi), %r11 -; AVX512-NEXT: andq 64(%rdi), %rax -; AVX512-NEXT: orq %r11, %rax -; AVX512-NEXT: andq 448(%rdi), %r12 -; AVX512-NEXT: andq 192(%rdi), %r13 -; AVX512-NEXT: orq %r12, %r13 -; AVX512-NEXT: orq %rax, %r13 -; AVX512-NEXT: andq 288(%rdi), %r8 -; AVX512-NEXT: andq 32(%rdi), %r14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 416(%rdi), %rax -; AVX512-NEXT: orq %r8, %r14 -; AVX512-NEXT: andq 160(%rdi), %r10 -; AVX512-NEXT: orq %rax, %r10 -; AVX512-NEXT: andq 352(%rdi), %rbx -; AVX512-NEXT: orq %r14, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 96(%rdi), %rax -; AVX512-NEXT: orq %rbx, %rax -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 480(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: andq 224(%rdi), %r15 -; AVX512-NEXT: orq %rax, %r15 -; AVX512-NEXT: orq %r8, %r15 -; AVX512-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 272(%rdi), %r8 -; AVX512-NEXT: orq %r10, %r15 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 16(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 400(%rdi), %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 144(%rdi), %rax -; AVX512-NEXT: orq %r9, %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 336(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 80(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 464(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: andq 208(%rdi), %r11 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: orq %r8, %r11 -; AVX512-NEXT: orq %rax, %r11 -; AVX512-NEXT: orq %r9, %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 304(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 48(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 432(%rdi), %r9 -; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload -; AVX512-NEXT: andq 176(%rdi), %r8 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 368(%rdi), %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 112(%rdi), %rax -; AVX512-NEXT: orq %r10, %r8 -; AVX512-NEXT: movq %r8, %r10 -; AVX512-NEXT: orq %r9, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 496(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 240(%rdi), %r9 -; AVX512-NEXT: orq %r8, %r9 -; AVX512-NEXT: orq %rax, %r9 -; AVX512-NEXT: orq %r10, %r9 -; AVX512-NEXT: orq %r11, %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 392(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: andq 136(%rdi), %rbp -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 328(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 72(%rdi), %rax -; AVX512-NEXT: orq %r10, %rbp -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 456(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX512-NEXT: andq 200(%rdi), %r12 -; AVX512-NEXT: orq %rax, %r12 -; AVX512-NEXT: orq %r8, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 296(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 40(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 424(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 168(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: orq %r10, %rax -; 
AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 360(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 104(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 488(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX512-NEXT: andq 232(%rdi), %r14 -; AVX512-NEXT: orq %rax, %r14 -; AVX512-NEXT: orq %r8, %r14 -; AVX512-NEXT: orq %r10, %r14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 280(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 24(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 408(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 152(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: andq 344(%rdi), %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 88(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 472(%rdi), %rax -; AVX512-NEXT: orq %r11, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: andq 216(%rdi), %rbx -; AVX512-NEXT: orq %rax, %rbx -; AVX512-NEXT: orq %r8, %rbx -; AVX512-NEXT: orq %r10, %rbx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 312(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 56(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 440(%rdi), %r8 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 184(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 376(%rdi), %r8 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 120(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 504(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 248(%rdi), %r8 -; AVX512-NEXT: orq %rax, %r8 -; AVX512-NEXT: orq %r10, %r8 -; AVX512-NEXT: orq %r11, %r8 -; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rsi, %r10 -; AVX512-NEXT: orq %rbx, %r8 -; AVX512-NEXT: shlxq %rcx, %rax, %rsi -; AVX512-NEXT: andq 256(%rdi), %r10 -; AVX512-NEXT: andq (%rdi), %rsi -; AVX512-NEXT: orq %r10, %rsi -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: orq %r13, %rsi -; AVX512-NEXT: orq %r15, %rsi -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: orq %r9, %rsi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 
8-byte Reload -; AVX512-NEXT: andq 264(%rdi), %rax -; AVX512-NEXT: andq 8(%rdi), %rdx -; AVX512-NEXT: orq %rax, %rdx -; AVX512-NEXT: orq %rbp, %rdx -; AVX512-NEXT: orq %r12, %rdx -; AVX512-NEXT: orq %r14, %rdx -; AVX512-NEXT: orq %r8, %rdx -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: setne %al -; AVX512-NEXT: addq $1560, %rsp # imm = 0x618 -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: movl $-2, %edx +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: roll %cl, %edx +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: shrl $3, %ecx +; AVX512-NEXT: andl $60, %ecx +; AVX512-NEXT: andl %edx, (%rdi,%rcx) +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq - %rem = and i32 %position, 4095 - %ofs = zext nneg i32 %rem to i4096 - %bit = shl nuw i4096 1, %ofs - %ld = load i4096, ptr %word - %test = and i4096 %ld, %bit - %cmp = icmp ne i4096 %test, 0 - ret i1 %cmp + %ld = load i512, ptr %word + %tz = tail call range(i512 0, 513) i512 @llvm.cttz.i512(i512 %ld, i1 false) + %tz.cast = trunc nuw nsw i512 %tz to i32 + %tz.mask = and i512 %tz, 511 + %mask = shl nuw i512 1, %tz.mask + %mask.not = xor i512 %mask, -1 + %blsr = and i512 %ld, %mask.not + store i512 %blsr, ptr %word + ret i32 %tz.cast } diff --git a/llvm/test/CodeGen/X86/build-vector-128.ll b/llvm/test/CodeGen/X86/build-vector-128.ll index e2db8d4..59eb776 100644 --- a/llvm/test/CodeGen/X86/build-vector-128.ll +++ b/llvm/test/CodeGen/X86/build-vector-128.ll @@ -410,6 +410,234 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ret <16 x i8> %ins15 } +; build vectors where integer operands are split (typically via legalization) + +define <4 x i32> @test_buildvector_v2i64_split_v4i32(i64 %a0, i64 %a1) nounwind { +; SSE-32-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE-32: # %bb.0: +; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: test_buildvector_v2i64_split_v4i32: +; SSE-64: # %bb.0: +; SSE-64-NEXT: movq %rsi, %xmm1 +; SSE-64-NEXT: movq %rdi, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v2i64_split_v4i32: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v2i64_split_v4i32: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovq %rsi, %xmm0 +; AVX-64-NEXT: vmovq %rdi, %xmm1 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-64-NEXT: retq + %a0.lo = trunc i64 %a0 to i32 + %a1.lo = trunc i64 %a1 to i32 + %a0.shr = lshr i64 %a0, 32 + %a1.shr = lshr i64 %a1, 32 + %a0.hi = trunc i64 %a0.shr to i32 + %a1.hi = trunc i64 %a1.shr to i32 + %v0 = insertelement <4 x i32> poison, i32 %a0.lo, i64 0 + %v1 = insertelement <4 x i32> %v0, i32 %a0.hi, i64 1 + %v2 = insertelement <4 x i32> %v1, i32 %a1.lo, i64 2 + %v3 = insertelement <4 x i32> %v2, i32 
%a1.hi, i64 3 + ret <4 x i32> %v3 +} + +define <8 x i16> @test_buildvector_v4i32_split_v8i16(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind { +; SSE-32-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE-32: # %bb.0: +; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; SSE-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: movd %ecx, %xmm0 +; SSE2-64-NEXT: movd %edx, %xmm1 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-64-NEXT: movd %esi, %xmm2 +; SSE2-64-NEXT: movd %edi, %xmm0 +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-64-NEXT: retq +; +; SSE41-64-LABEL: test_buildvector_v4i32_split_v8i16: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movd %edi, %xmm0 +; SSE41-64-NEXT: pinsrd $1, %esi, %xmm0 +; SSE41-64-NEXT: pinsrd $2, %edx, %xmm0 +; SSE41-64-NEXT: pinsrd $3, %ecx, %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v4i32_split_v8i16: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v4i32_split_v8i16: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovd %edi, %xmm0 +; AVX-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i32 %a0 to i16 + %a1.lo = trunc i32 %a1 to i16 + %a2.lo = trunc i32 %a2 to i16 + %a3.lo = trunc i32 %a3 to i16 + %a0.shr = lshr i32 %a0, 16 + %a1.shr = lshr i32 %a1, 16 + %a2.shr = lshr i32 %a2, 16 + %a3.shr = lshr i32 %a3, 16 + %a0.hi = trunc i32 %a0.shr to i16 + %a1.hi = trunc i32 %a1.shr to i16 + %a2.hi = trunc i32 %a2.shr to i16 + %a3.hi = trunc i32 %a3.shr to i16 + %v0 = insertelement <8 x i16> poison, i16 %a0.lo, i64 0 + %v1 = insertelement <8 x i16> %v0, i16 %a0.hi, i64 1 + %v2 = insertelement <8 x i16> %v1, i16 %a1.lo, i64 2 + %v3 = insertelement <8 x i16> %v2, i16 %a1.hi, i64 3 + %v4 = insertelement <8 x i16> %v3, i16 %a2.lo, i64 4 + %v5 = insertelement <8 x i16> %v4, i16 %a2.hi, i64 5 + %v6 = insertelement <8 x i16> %v5, i16 %a3.lo, i64 6 + %v7 = insertelement <8 x i16> %v6, i16 %a3.hi, i64 7 + ret <8 x i16> %v7 +} + +define <16 x i8> @test_buildvector_v8i16_split_v16i8(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind { +; SSE2-32-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE2-32: # %bb.0: +; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-32-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-32-NEXT: 
punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-32-NEXT: retl +; +; SSE2-64-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE2-64: # %bb.0: +; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-64-NEXT: movd %r9d, %xmm0 +; SSE2-64-NEXT: movd %r8d, %xmm2 +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-64-NEXT: movd %ecx, %xmm0 +; SSE2-64-NEXT: movd %edx, %xmm1 +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-64-NEXT: movd %esi, %xmm3 +; SSE2-64-NEXT: movd %edi, %xmm0 +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-64-NEXT: retq +; +; SSE41-32-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE41-32: # %bb.0: +; SSE41-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-32-NEXT: pinsrw $1, {{[0-9]+}}(%esp), %xmm0 +; SSE41-32-NEXT: pinsrw $2, {{[0-9]+}}(%esp), %xmm0 +; SSE41-32-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0 +; SSE41-32-NEXT: pinsrw $4, {{[0-9]+}}(%esp), %xmm0 +; SSE41-32-NEXT: pinsrw $5, {{[0-9]+}}(%esp), %xmm0 +; SSE41-32-NEXT: pinsrw $6, {{[0-9]+}}(%esp), %xmm0 +; SSE41-32-NEXT: pinsrw $7, {{[0-9]+}}(%esp), %xmm0 +; SSE41-32-NEXT: retl +; +; SSE41-64-LABEL: test_buildvector_v8i16_split_v16i8: +; SSE41-64: # %bb.0: +; SSE41-64-NEXT: movd %edi, %xmm0 +; SSE41-64-NEXT: pinsrw $1, %esi, %xmm0 +; SSE41-64-NEXT: pinsrw $2, %edx, %xmm0 +; SSE41-64-NEXT: pinsrw $3, %ecx, %xmm0 +; SSE41-64-NEXT: pinsrw $4, %r8d, %xmm0 +; SSE41-64-NEXT: pinsrw $5, %r9d, %xmm0 +; SSE41-64-NEXT: pinsrw $6, {{[0-9]+}}(%rsp), %xmm0 +; SSE41-64-NEXT: pinsrw $7, {{[0-9]+}}(%rsp), %xmm0 +; SSE41-64-NEXT: retq +; +; AVX-32-LABEL: test_buildvector_v8i16_split_v16i8: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_buildvector_v8i16_split_v16i8: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovd %edi, %xmm0 +; AVX-64-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: retq + %a0.lo = trunc i16 %a0 to i8 + %a1.lo = trunc i16 %a1 to i8 + %a2.lo = trunc i16 %a2 to i8 + %a3.lo = trunc i16 %a3 to i8 + %a4.lo = trunc i16 %a4 to i8 + %a5.lo = trunc i16 %a5 to i8 + %a6.lo = trunc i16 %a6 to i8 + %a7.lo = trunc i16 %a7 to i8 + %a0.shr = lshr i16 %a0, 8 + %a1.shr = lshr i16 %a1, 8 + %a2.shr = lshr i16 %a2, 8 + %a3.shr = lshr i16 %a3, 8 + %a4.shr = lshr i16 %a4, 8 + 
%a5.shr = lshr i16 %a5, 8 + %a6.shr = lshr i16 %a6, 8 + %a7.shr = lshr i16 %a7, 8 + %a0.hi = trunc i16 %a0.shr to i8 + %a1.hi = trunc i16 %a1.shr to i8 + %a2.hi = trunc i16 %a2.shr to i8 + %a3.hi = trunc i16 %a3.shr to i8 + %a4.hi = trunc i16 %a4.shr to i8 + %a5.hi = trunc i16 %a5.shr to i8 + %a6.hi = trunc i16 %a6.shr to i8 + %a7.hi = trunc i16 %a7.shr to i8 + %v0 = insertelement <16 x i8> poison, i8 %a0.lo, i64 0 + %v1 = insertelement <16 x i8> %v0, i8 %a0.hi, i64 1 + %v2 = insertelement <16 x i8> %v1, i8 %a1.lo, i64 2 + %v3 = insertelement <16 x i8> %v2, i8 %a1.hi, i64 3 + %v4 = insertelement <16 x i8> %v3, i8 %a2.lo, i64 4 + %v5 = insertelement <16 x i8> %v4, i8 %a2.hi, i64 5 + %v6 = insertelement <16 x i8> %v5, i8 %a3.lo, i64 6 + %v7 = insertelement <16 x i8> %v6, i8 %a3.hi, i64 7 + %v8 = insertelement <16 x i8> %v7, i8 %a4.lo, i64 8 + %v9 = insertelement <16 x i8> %v8, i8 %a4.hi, i64 9 + %v10 = insertelement <16 x i8> %v9, i8 %a5.lo, i64 10 + %v11 = insertelement <16 x i8> %v10, i8 %a5.hi, i64 11 + %v12 = insertelement <16 x i8> %v11, i8 %a6.lo, i64 12 + %v13 = insertelement <16 x i8> %v12, i8 %a6.hi, i64 13 + %v14 = insertelement <16 x i8> %v13, i8 %a7.lo, i64 14 + %v15 = insertelement <16 x i8> %v14, i8 %a7.hi, i64 15 + ret <16 x i8> %v15 +} + ; build vectors of repeated elements define <4 x float> @test_buildvector_4f32_2_var(float %a0, float %a1) { diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll index 3edb712..773eb8f 100644 --- a/llvm/test/CodeGen/X86/build-vector-256.ll +++ b/llvm/test/CodeGen/X86/build-vector-256.ll @@ -417,9 +417,8 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) { ; AVX1-32-LABEL: test_buildvector_4f64_2_var: ; AVX1-32: # %bb.0: -; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX1-32-NEXT: vmovupd {{[0-9]+}}(%esp), %xmm0 +; AVX1-32-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-32-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll index f36baba..ab8498d 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll @@ -14,7 +14,6 @@ entry: } ; CHECK: _ZL10myCallbacki: -; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: define internal void @_ZL10myCallbacki(i32 %value) !type !2 { entry: %sink = alloca i32, align 4 @@ -33,6 +32,6 @@ entry: ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. 
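;; Each record checked below is a (flags byte, entry PC, 64-bit type id)
;; triple; with this change the entry PC refers to the function symbol
;; itself rather than a local .Lfunc_begin label. A minimal sketch of IR
;; that carries such a type id (the names @cb and !9 are illustrative,
;; not taken from this test):
define internal void @cb(i32 %value) !type !9 {
entry:
  ret void
}
!9 = !{i64 0, !"_ZTSFviE.generalized"}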
; CHECK-NEXT: .byte 1 ;; Function Entry PC -; CHECK-NEXT: .quad [[LABEL_FUNC]] +; CHECK-NEXT: .quad _ZL10myCallbacki ;; Function type ID ; CHECK-NEXT: .quad -5212364466660467813 diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll index cdbad66..02d7107 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll @@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8) declare !type !2 ptr @direct_baz(ptr) ; CHECK: ball: -; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: define ptr @ball() { entry: call void @direct_foo() @@ -42,7 +41,7 @@ entry: ;; Flags ; CHECK-NEXT: .byte 7 ;; Function Entry PC -; CHECK-NEXT: .quad [[LABEL_FUNC]] +; CHECK-NEXT: .quad ball ;; Function type ID -- set to 0 as no type metadata attached to function. ; CHECK-NEXT: .quad 0 ;; Number of unique direct callees. diff --git a/llvm/test/CodeGen/X86/cfi-inserter-verify-inconsistent-loc.mir b/llvm/test/CodeGen/X86/cfi-inserter-verify-inconsistent-loc.mir index ef9fb22..8211f89 100644 --- a/llvm/test/CodeGen/X86/cfi-inserter-verify-inconsistent-loc.mir +++ b/llvm/test/CodeGen/X86/cfi-inserter-verify-inconsistent-loc.mir @@ -1,4 +1,3 @@ -# REQUIRES: asserts # RUN: not --crash llc -o - %s -mtriple=x86_64-- \ # RUN: -run-pass=cfi-instr-inserter 2>&1 | FileCheck %s # Test that CSR being saved in multiple locations can be caught by @@ -10,8 +9,7 @@ } ... --- -# CHECK: Different saved locations for the same CSR -# CHECK-NEXT: UNREACHABLE executed +# CHECK: LLVM ERROR: Different saved locations for the same CSR name: inconsistentlocs body: | bb.0: diff --git a/llvm/test/CodeGen/X86/chain_order.ll b/llvm/test/CodeGen/X86/chain_order.ll index 3ced27f..18faec5 100644 --- a/llvm/test/CodeGen/X86/chain_order.ll +++ b/llvm/test/CodeGen/X86/chain_order.ll @@ -6,9 +6,8 @@ define void @cftx020(ptr nocapture %a) { ; CHECK-LABEL: cftx020: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovupd (%rdi), %xmm1 ; CHECK-NEXT: vmovupd %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll index 4d41c84..a42a715 100644 --- a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll +++ b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll @@ -7,8 +7,8 @@ define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0_(ptr %r) { ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0_: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: jmp .LBB0_1 ; CHECK-NEXT: .p2align 4 @@ -68,8 +68,8 @@ _ZNK4llvm5APInt13getActiveBitsEv.exit.i.i: ; preds = %for.body.i.i.i.i.i define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0__assert(ptr %r) { ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0__assert: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: jmp .LBB1_1 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/X86/combine-fceil.ll b/llvm/test/CodeGen/X86/combine-fceil.ll new file mode 100644 index 
0000000..a3f55e8 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-fceil.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_ceil_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_ceil_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $10, %xmm0, %xmm0 +; SSE-NEXT: roundpd $10, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_ceil_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vroundpd $10, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_ceil_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_ceil_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $10, %xmm0, %xmm0 +; SSE-NEXT: roundps $10, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_ceil_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vroundps $10, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_ceil_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_ceil_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $10, %xmm0, %xmm0 +; SSE-NEXT: roundpd $10, %xmm1, %xmm1 +; SSE-NEXT: roundpd $10, %xmm2, %xmm2 +; SSE-NEXT: roundpd $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: concat_ceil_v8f64_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundpd $10, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vroundpd $10, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_ceil_v8f64_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vroundpd $10, %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX2-NEXT: vroundpd $10, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vrndscalepd $10, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 
x double> @llvm.ceil.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_ceil_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_ceil_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $10, %xmm0, %xmm0 +; SSE-NEXT: roundps $10, %xmm1, %xmm1 +; SSE-NEXT: roundps $10, %xmm2, %xmm2 +; SSE-NEXT: roundps $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: concat_ceil_v16f32_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundps $10, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vroundps $10, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_ceil_v16f32_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vroundps $10, %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX2-NEXT: vroundps $10, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vrndscaleps $10, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_ceil_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_ceil_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $10, %xmm0, %xmm0 +; SSE-NEXT: roundpd $10, %xmm1, %xmm1 +; SSE-NEXT: roundpd $10, %xmm2, %xmm2 +; SSE-NEXT: roundpd $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $10, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $10, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vrndscalepd $10, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> 
@llvm.ceil.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.ceil.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_ceil_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_ceil_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $10, %xmm0, %xmm0 +; SSE-NEXT: roundps $10, %xmm1, %xmm1 +; SSE-NEXT: roundps $10, %xmm2, %xmm2 +; SSE-NEXT: roundps $10, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_ceil_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $10, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $10, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_ceil_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vrndscaleps $10, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.ceil.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.ceil.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-fcmp.ll b/llvm/test/CodeGen/X86/combine-fcmp.ll new file mode 100644 index 0000000..f2666f6 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-fcmp.ll @@ -0,0 +1,330 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX1OR2,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX1OR2,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512 + +define i4 @concat_fcmp_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_fcmp_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: xorpd %xmm2, %xmm2 +; SSE-NEXT: xorpd %xmm3, %xmm3 +; SSE-NEXT: cmpltpd %xmm0, %xmm3 +; SSE-NEXT: cmpltpd %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE-NEXT: movmskps %xmm3, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_fcmp_v4f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX1OR2-NEXT: vcmpltpd %xmm0, %xmm2, %xmm0 +; AVX1OR2-NEXT: vcmpltpd %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1OR2-NEXT: vmovmskps %xmm0, %eax +; AVX1OR2-NEXT: # kill: def $al killed $al killed $eax +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_fcmp_v4f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vcmpltpd %xmm0, %xmm2, %k0 +; AVX512-NEXT: vcmpltpd %xmm1, %xmm2, %k1 +; AVX512-NEXT: kshiftlb $2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retq + %v0 = fcmp ogt <2 x double> %a0, zeroinitializer + %v1 = fcmp ogt <2 x double> %a1, zeroinitializer + %v = shufflevector <2 x i1> %v0, <2 x i1> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r = bitcast <4 x i1> %v to i4 + ret i4 %r +} + 
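; The reduction tested throughout this file: each vector fcmp produces an
; <N x i1> mask, the shufflevector concatenates two masks, and the bitcast
; to iN is what the backend materializes -- via shufps/packssdw plus
; movmskps/pmovmskb on SSE/AVX, or kmovd from a mask register on AVX512.
; Note also that ogt against zero is emitted as cmpltpd with the operands
; swapped (0 < x). A minimal standalone sketch of the pattern in isolation
; (the function name is illustrative):
define i4 @sign_mask_sketch(<4 x double> %x) {
  %c = fcmp ogt <4 x double> %x, zeroinitializer
  %m = bitcast <4 x i1> %c to i4
  ret i4 %m
}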
+define i8 @concat_fcmp_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_fcmp_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: cmpeqps %xmm2, %xmm0 +; SSE-NEXT: cmpeqps %xmm2, %xmm1 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_fcmp_v8f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1OR2-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0 +; AVX1OR2-NEXT: vcmpeqps %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: # kill: def $al killed $al killed $eax +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_fcmp_v8f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcmpeqps %ymm1, %ymm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %v0 = fcmp oeq <4 x float> %a0, zeroinitializer + %v1 = fcmp oeq <4 x float> %a1, zeroinitializer + %v = shufflevector <4 x i1> %v0, <4 x i1> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r = bitcast <8 x i1> %v to i8 + ret i8 %r +} + +define i8 @concat_fcmp_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_fcmp_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: xorpd %xmm4, %xmm4 +; SSE-NEXT: cmpltpd %xmm4, %xmm0 +; SSE-NEXT: cmpltpd %xmm4, %xmm1 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: cmpltpd %xmm4, %xmm2 +; SSE-NEXT: cmpltpd %xmm4, %xmm3 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: packssdw %xmm0, %xmm0 +; SSE-NEXT: packssdw %xmm2, %xmm2 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_fcmp_v8f64_v2f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX1OR2-NEXT: vcmpltpd %xmm4, %xmm0, %xmm0 +; AVX1OR2-NEXT: vcmpltpd %xmm4, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vcmpltpd %xmm4, %xmm2, %xmm1 +; AVX1OR2-NEXT: vcmpltpd %xmm4, %xmm3, %xmm2 +; AVX1OR2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3] +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: # kill: def $al killed $al killed $eax +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_fcmp_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcmpltpd %zmm1, %zmm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %v0 = fcmp olt <2 x double> %a0, zeroinitializer + %v1 = fcmp olt <2 x double> %a1, zeroinitializer + %v2 = fcmp olt <2 x double> %a2, zeroinitializer + 
%v3 = fcmp olt <2 x double> %a3, zeroinitializer + %v01 = shufflevector <2 x i1> %v0, <2 x i1> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %v23 = shufflevector <2 x i1> %v2, <2 x i1> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %v = shufflevector <4 x i1> %v01, <4 x i1> %v23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r = bitcast <8 x i1> %v to i8 + ret i8 %r +} + +define i16 @concat_fcmp_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_fcmp_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: xorps %xmm4, %xmm4 +; SSE-NEXT: xorps %xmm5, %xmm5 +; SSE-NEXT: cmpleps %xmm0, %xmm5 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cmpleps %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm0, %xmm5 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cmpleps %xmm2, %xmm0 +; SSE-NEXT: cmpleps %xmm3, %xmm4 +; SSE-NEXT: packssdw %xmm4, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm5 +; SSE-NEXT: pmovmskb %xmm5, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_fcmp_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX1OR2-NEXT: vcmpleps %xmm0, %xmm4, %xmm0 +; AVX1OR2-NEXT: vcmpleps %xmm1, %xmm4, %xmm1 +; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vcmpleps %xmm2, %xmm4, %xmm1 +; AVX1OR2-NEXT: vcmpleps %xmm3, %xmm4, %xmm2 +; AVX1OR2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_fcmp_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcmpleps %zmm0, %zmm1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %v0 = fcmp oge <4 x float> %a0, zeroinitializer + %v1 = fcmp oge <4 x float> %a1, zeroinitializer + %v2 = fcmp oge <4 x float> %a2, zeroinitializer + %v3 = fcmp oge <4 x float> %a3, zeroinitializer + %v01 = shufflevector <4 x i1> %v0, <4 x i1> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %v23 = shufflevector <4 x i1> %v2, <4 x i1> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %v = shufflevector <8 x i1> %v01, <8 x i1> %v23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %r = bitcast <16 x i1> %v to i16 + ret i16 %r +} + +define i8 @concat_fcmp_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_fcmp_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: xorpd %xmm4, %xmm4 +; SSE-NEXT: movapd %xmm1, %xmm5 +; SSE-NEXT: cmpneqpd %xmm4, %xmm5 +; SSE-NEXT: cmpordpd %xmm4, %xmm1 +; SSE-NEXT: andpd %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: movapd %xmm0, %xmm5 +; SSE-NEXT: cmpneqpd %xmm4, %xmm5 +; SSE-NEXT: cmpordpd %xmm4, %xmm0 +; SSE-NEXT: andpd %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movapd %xmm3, %xmm1 +; SSE-NEXT: cmpneqpd %xmm4, %xmm1 +; SSE-NEXT: cmpordpd %xmm4, %xmm3 
+; SSE-NEXT: andpd %xmm1, %xmm3 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: cmpneqpd %xmm4, %xmm1 +; SSE-NEXT: cmpordpd %xmm4, %xmm2 +; SSE-NEXT: andpd %xmm1, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: packssdw %xmm2, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retq +; +; AVX1-LABEL: concat_fcmp_v8f64_v4f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneq_oqpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; AVX1-NEXT: vcmpneq_oqpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_fcmp_v8f64_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcmpneq_oqpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; AVX2-NEXT: vcmpneq_oqpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_fcmp_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %v0 = fcmp one <4 x double> %a0, zeroinitializer + %v1 = fcmp one <4 x double> %a1, zeroinitializer + %v = shufflevector <4 x i1> %v0, <4 x i1> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r = bitcast <8 x i1> %v to i8 + ret i8 %r +} + +define i16 @concat_fcmp_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_fcmp_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: xorps %xmm4, %xmm4 +; SSE-NEXT: cmpleps %xmm4, %xmm1 +; SSE-NEXT: cmpleps %xmm4, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: cmpleps %xmm4, %xmm3 +; SSE-NEXT: cmpleps %xmm4, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_fcmp_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1OR2-NEXT: vcmpleps %ymm2, %ymm0, %ymm0 +; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1OR2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1OR2-NEXT: vcmpleps %ymm2, %ymm1, %ymm1 +; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1OR2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: # kill: def $ax killed $ax 
killed $eax +; AVX1OR2-NEXT: vzeroupper +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_fcmp_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcmpleps %zmm1, %zmm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %v0 = fcmp ole <8 x float> %a0, zeroinitializer + %v1 = fcmp ole <8 x float> %a1, zeroinitializer + %v = shufflevector <8 x i1> %v0, <8 x i1> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %r = bitcast <16 x i1> %v to i16 + ret i16 %r +} diff --git a/llvm/test/CodeGen/X86/combine-ffloor.ll b/llvm/test/CodeGen/X86/combine-ffloor.ll new file mode 100644 index 0000000..5cde95e --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-ffloor.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_floor_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_floor_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $9, %xmm0, %xmm0 +; SSE-NEXT: roundpd $9, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_floor_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vroundpd $9, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.floor.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.floor.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_floor_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_floor_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $9, %xmm0, %xmm0 +; SSE-NEXT: roundps $9, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_floor_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vroundps $9, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_floor_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_floor_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $9, %xmm0, %xmm0 +; SSE-NEXT: roundpd $9, %xmm1, %xmm1 +; SSE-NEXT: roundpd $9, %xmm2, %xmm2 +; SSE-NEXT: roundpd $9, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: concat_floor_v8f64_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundpd $9, 
%ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vroundpd $9, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_floor_v8f64_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vroundpd $9, %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX2-NEXT: vroundpd $9, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_floor_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vrndscalepd $9, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.floor.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.floor.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.floor.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.floor.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_floor_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_floor_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $9, %xmm0, %xmm0 +; SSE-NEXT: roundps $9, %xmm1, %xmm1 +; SSE-NEXT: roundps $9, %xmm2, %xmm2 +; SSE-NEXT: roundps $9, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: concat_floor_v16f32_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundps $9, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vroundps $9, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_floor_v16f32_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vroundps $9, %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX2-NEXT: vroundps $9, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_floor_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vrndscaleps $9, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 
0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_floor_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_floor_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $9, %xmm0, %xmm0 +; SSE-NEXT: roundpd $9, %xmm1, %xmm1 +; SSE-NEXT: roundpd $9, %xmm2, %xmm2 +; SSE-NEXT: roundpd $9, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_floor_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $9, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $9, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_floor_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vrndscalepd $9, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.floor.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.floor.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_floor_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_floor_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $9, %xmm0, %xmm0 +; SSE-NEXT: roundps $9, %xmm1, %xmm1 +; SSE-NEXT: roundps $9, %xmm2, %xmm2 +; SSE-NEXT: roundps $9, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_floor_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $9, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $9, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_floor_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vrndscaleps $9, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.floor.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.floor.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-fnearbyint.ll b/llvm/test/CodeGen/X86/combine-fnearbyint.ll new file mode 100644 index 0000000..fde136a --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-fnearbyint.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_nearbyint_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_nearbyint_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $12, %xmm0, %xmm0 +; SSE-NEXT: roundpd $12, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_nearbyint_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vroundpd $12, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.nearbyint.v2f64(<2 
x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_nearbyint_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_nearbyint_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $12, %xmm0, %xmm0 +; SSE-NEXT: roundps $12, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_nearbyint_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vroundps $12, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_nearbyint_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_nearbyint_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $12, %xmm0, %xmm0 +; SSE-NEXT: roundpd $12, %xmm1, %xmm1 +; SSE-NEXT: roundpd $12, %xmm2, %xmm2 +; SSE-NEXT: roundpd $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: concat_nearbyint_v8f64_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundpd $12, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vroundpd $12, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_nearbyint_v8f64_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vroundpd $12, %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX2-NEXT: vroundpd $12, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vrndscalepd $12, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_nearbyint_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_nearbyint_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $12, %xmm0, %xmm0 +; SSE-NEXT: roundps $12, %xmm1, %xmm1 +; SSE-NEXT: roundps $12, %xmm2, %xmm2 +; SSE-NEXT: roundps $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: concat_nearbyint_v16f32_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 
def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundps $12, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vroundps $12, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_nearbyint_v16f32_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vroundps $12, %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX2-NEXT: vroundps $12, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vrndscaleps $12, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_nearbyint_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_nearbyint_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $12, %xmm0, %xmm0 +; SSE-NEXT: roundpd $12, %xmm1, %xmm1 +; SSE-NEXT: roundpd $12, %xmm2, %xmm2 +; SSE-NEXT: roundpd $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $12, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $12, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vrndscalepd $12, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_nearbyint_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_nearbyint_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $12, %xmm0, %xmm0 +; SSE-NEXT: roundps $12, %xmm1, %xmm1 +; SSE-NEXT: roundps $12, %xmm2, %xmm2 +; SSE-NEXT: roundps $12, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_nearbyint_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $12, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $12, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_nearbyint_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vrndscaleps $12, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> 
@llvm.nearbyint.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-frint.ll b/llvm/test/CodeGen/X86/combine-frint.ll new file mode 100644 index 0000000..1c52529 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-frint.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_rint_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_rint_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $4, %xmm0, %xmm0 +; SSE-NEXT: roundpd $4, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rint_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_rint_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_rint_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $4, %xmm0, %xmm0 +; SSE-NEXT: roundps $4, %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rint_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_rint_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; SSE-LABEL: concat_rint_v8f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $4, %xmm0, %xmm0 +; SSE-NEXT: roundpd $4, %xmm1, %xmm1 +; SSE-NEXT: roundpd $4, %xmm2, %xmm2 +; SSE-NEXT: roundpd $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: concat_rint_v8f64_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_rint_v8f64_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX2-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX2-NEXT: 
retq +; +; AVX512-LABEL: concat_rint_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vrndscalepd $4, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a1) + %v2 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a2) + %v3 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a3) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_rint_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_rint_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $4, %xmm0, %xmm0 +; SSE-NEXT: roundps $4, %xmm1, %xmm1 +; SSE-NEXT: roundps $4, %xmm2, %xmm2 +; SSE-NEXT: roundps $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: concat_rint_v16f32_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_rint_v16f32_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX2-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vrndscaleps $4, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a1) + %v2 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a2) + %v3 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_rint_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; SSE-LABEL: concat_rint_v8f64_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: roundpd $4, %xmm0, %xmm0 +; SSE-NEXT: roundpd $4, %xmm1, %xmm1 +; SSE-NEXT: roundpd $4, %xmm2, %xmm2 +; SSE-NEXT: roundpd $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v8f64_v4f64: +; 
AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vrndscalepd $4, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %a0) + %v1 = call <4 x double> @llvm.rint.v4f64(<4 x double> %a1) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_rint_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; SSE-LABEL: concat_rint_v16f32_v8f32: +; SSE: # %bb.0: +; SSE-NEXT: roundps $4, %xmm0, %xmm0 +; SSE-NEXT: roundps $4, %xmm1, %xmm1 +; SSE-NEXT: roundps $4, %xmm2, %xmm2 +; SSE-NEXT: roundps $4, %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rint_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rint_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vrndscaleps $4, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.rint.v8f32(<8 x float> %a0) + %v1 = call <8 x float> @llvm.rint.v8f32(<8 x float> %a1) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-fround.ll b/llvm/test/CodeGen/X86/combine-fround.ll new file mode 100644 index 0000000..42dbaf2 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-fround.ll @@ -0,0 +1,419 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_round_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; SSE-LABEL: concat_round_v4f64_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; SSE-NEXT: movapd %xmm0, %xmm3 +; SSE-NEXT: andpd %xmm2, %xmm3 +; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.9999999999999994E-1,4.9999999999999994E-1] +; SSE-NEXT: orpd %xmm4, %xmm3 +; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: roundpd $11, %xmm3, %xmm0 +; SSE-NEXT: andpd %xmm1, %xmm2 +; SSE-NEXT: orpd %xmm4, %xmm2 +; SSE-NEXT: addpd %xmm1, %xmm2 +; SSE-NEXT: roundpd $11, %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: concat_round_v4f64_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX1-NEXT: vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_round_v4f64_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX2-NEXT: vandpd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1] +; AVX2-NEXT: vorpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vroundpd $11, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_round_v4f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & m64bcst) +; AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vroundpd $11, %ymm0, %ymm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.round.v2f64(<2 x double> %a0) + %v1 = call <2 x double> @llvm.round.v2f64(<2 x double> %a1) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_round_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_round_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: andps %xmm2, %xmm3 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; SSE-NEXT: orps %xmm4, %xmm3 +; SSE-NEXT: addps %xmm0, %xmm3 +; SSE-NEXT: roundps $11, %xmm3, %xmm0 +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: addps %xmm1, %xmm2 +; SSE-NEXT: roundps $11, %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: concat_round_v8f32_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundps $11, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_round_v8f32_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vroundps $11, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_round_v8f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 | (ymm0 & m32bcst) +; AVX512-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $11, %ymm0, %ymm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.round.v4f32(<4 x float> %a0) + %v1 = call <4 x float> @llvm.round.v4f32(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res 
+}
+
+define <8 x double> @concat_round_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; SSE-LABEL: concat_round_v8f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: movapd {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0]
+; SSE-NEXT: movapd %xmm0, %xmm5
+; SSE-NEXT: andpd %xmm4, %xmm5
+; SSE-NEXT: movapd {{.*#+}} xmm6 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; SSE-NEXT: orpd %xmm6, %xmm5
+; SSE-NEXT: addpd %xmm0, %xmm5
+; SSE-NEXT: roundpd $11, %xmm5, %xmm0
+; SSE-NEXT: movapd %xmm1, %xmm5
+; SSE-NEXT: andpd %xmm4, %xmm5
+; SSE-NEXT: orpd %xmm6, %xmm5
+; SSE-NEXT: addpd %xmm1, %xmm5
+; SSE-NEXT: roundpd $11, %xmm5, %xmm1
+; SSE-NEXT: movapd %xmm2, %xmm5
+; SSE-NEXT: andpd %xmm4, %xmm5
+; SSE-NEXT: orpd %xmm6, %xmm5
+; SSE-NEXT: addpd %xmm2, %xmm5
+; SSE-NEXT: roundpd $11, %xmm5, %xmm2
+; SSE-NEXT: andpd %xmm3, %xmm4
+; SSE-NEXT: orpd %xmm6, %xmm4
+; SSE-NEXT: addpd %xmm3, %xmm4
+; SSE-NEXT: roundpd $11, %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_round_v8f64_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm4
+; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX1-NEXT: vorpd %ymm5, %ymm4, %ymm4
+; AVX1-NEXT: vaddpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorpd %ymm5, %ymm1, %ymm1
+; AVX1-NEXT: vaddpd %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_round_v8f64_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX2-NEXT: vandpd %ymm1, %ymm0, %ymm4
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX2-NEXT: vorpd %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vaddpd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vandpd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vorpd %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_round_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & m64bcst)
+; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.round.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.round.v2f64(<2 x double> %a1)
+ %v2 = call <2 x double> @llvm.round.v2f64(<2 x double> %a2)
+ %v3 = call <2 x double> @llvm.round.v2f64(<2 x double> %a3)
+ %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_round_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_round_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE-NEXT: movaps %xmm0, %xmm5
+; SSE-NEXT: andps %xmm4, %xmm5
+; SSE-NEXT: movaps {{.*#+}} xmm6 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; SSE-NEXT: orps %xmm6, %xmm5
+; SSE-NEXT: addps %xmm0, %xmm5
+; SSE-NEXT: roundps $11, %xmm5, %xmm0
+; SSE-NEXT: movaps %xmm1, %xmm5
+; SSE-NEXT: andps %xmm4, %xmm5
+; SSE-NEXT: orps %xmm6, %xmm5
+; SSE-NEXT: addps %xmm1, %xmm5
+; SSE-NEXT: roundps $11, %xmm5, %xmm1
+; SSE-NEXT: movaps %xmm2, %xmm5
+; SSE-NEXT: andps %xmm4, %xmm5
+; SSE-NEXT: orps %xmm6, %xmm5
+; SSE-NEXT: addps %xmm2, %xmm5
+; SSE-NEXT: roundps $11, %xmm5, %xmm2
+; SSE-NEXT: andps %xmm3, %xmm4
+; SSE-NEXT: orps %xmm6, %xmm4
+; SSE-NEXT: addps %xmm3, %xmm4
+; SSE-NEXT: roundps $11, %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_round_v16f32_v4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm4
+; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX1-NEXT: vorps %ymm5, %ymm4, %ymm4
+; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
+; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_round_v16f32_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm4
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX2-NEXT: vorps %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vandps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vorps %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_round_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & m32bcst)
+; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.round.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.round.v4f32(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.round.v4f32(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.round.v4f32(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+define <8 x double> @concat_round_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: concat_round_v8f64_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: movapd {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0]
+; SSE-NEXT: movapd %xmm0, %xmm5
+; SSE-NEXT: andpd %xmm4, %xmm5
+; SSE-NEXT: movapd {{.*#+}} xmm6 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; SSE-NEXT: orpd %xmm6, %xmm5
+; SSE-NEXT: addpd %xmm0, %xmm5
+; SSE-NEXT: roundpd $11, %xmm5, %xmm0
+; SSE-NEXT: movapd %xmm1, %xmm5
+; SSE-NEXT: andpd %xmm4, %xmm5
+; SSE-NEXT: orpd %xmm6, %xmm5
+; SSE-NEXT: addpd %xmm1, %xmm5
+; SSE-NEXT: roundpd $11, %xmm5, %xmm1
+; SSE-NEXT: movapd %xmm2, %xmm5
+; SSE-NEXT: andpd %xmm4, %xmm5
+; SSE-NEXT: orpd %xmm6, %xmm5
+; SSE-NEXT: addpd %xmm2, %xmm5
+; SSE-NEXT: roundpd $11, %xmm5, %xmm2
+; SSE-NEXT: andpd %xmm3, %xmm4
+; SSE-NEXT: orpd %xmm6, %xmm4
+; SSE-NEXT: addpd %xmm3, %xmm4
+; SSE-NEXT: roundpd $11, %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_round_v8f64_v4f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm3
+; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3
+; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm2
+; AVX1-NEXT: vorpd %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_round_v8f64_v4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX2-NEXT: vandpd %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX2-NEXT: vorpd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX2-NEXT: vandpd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vorpd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_round_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & m64bcst)
+; AVX512-NEXT: vaddpd %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x double> @llvm.round.v4f64(<4 x double> %a0)
+ %v1 = call <4 x double> @llvm.round.v4f64(<4 x double> %a1)
+ %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_round_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: concat_round_v16f32_v8f32:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE-NEXT: movaps %xmm0, %xmm5
+; SSE-NEXT: andps %xmm4, %xmm5
+; SSE-NEXT: movaps {{.*#+}} xmm6 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; SSE-NEXT: orps %xmm6, %xmm5
+; SSE-NEXT: addps %xmm0, %xmm5
+; SSE-NEXT: roundps $11, %xmm5, %xmm0
+; SSE-NEXT: movaps %xmm1, %xmm5
+; SSE-NEXT: andps %xmm4, %xmm5
+; SSE-NEXT: orps %xmm6, %xmm5
+; SSE-NEXT: addps %xmm1, %xmm5
+; SSE-NEXT: roundps $11, %xmm5, %xmm1
+; SSE-NEXT: movaps %xmm2, %xmm5
+; SSE-NEXT: andps %xmm4, %xmm5
+; SSE-NEXT: orps %xmm6, %xmm5
+; SSE-NEXT: addps %xmm2, %xmm5
+; SSE-NEXT: roundps $11, %xmm5, %xmm2
+; SSE-NEXT: andps %xmm3, %xmm4
+; SSE-NEXT: orps %xmm6, %xmm4
+; SSE-NEXT: addps %xmm3, %xmm4
+; SSE-NEXT: roundps $11, %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_round_v16f32_v8f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm3
+; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX1-NEXT: vorps %ymm4, %ymm3, %ymm3
+; AVX1-NEXT: vaddps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm2
+; AVX1-NEXT: vorps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_round_v16f32_v8f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX2-NEXT: vandps %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX2-NEXT: vorps %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vaddps %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX2-NEXT: vandps %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vorps %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_round_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 | (zmm0 & m32bcst)
+; AVX512-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <8 x float> @llvm.round.v8f32(<8 x float> %a0)
+ %v1 = call <8 x float> @llvm.round.v8f32(<8 x float> %a1)
+ %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX: {{.*}}
diff --git a/llvm/test/CodeGen/X86/combine-froundeven.ll b/llvm/test/CodeGen/X86/combine-froundeven.ll
new file mode 100644
index 0000000..4bf1e86
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-froundeven.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <4 x double> @concat_roundeven_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: concat_roundeven_v4f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $8, %xmm0, %xmm0
+; SSE-NEXT: roundpd $8, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_roundeven_v4f64_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $8, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a1)
+ %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @concat_roundeven_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_roundeven_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $8, %xmm0, %xmm0
+; SSE-NEXT: roundps $8, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_roundeven_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundps $8, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a1)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x double> @concat_roundeven_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; SSE-LABEL: concat_roundeven_v8f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $8, %xmm0, %xmm0
+; SSE-NEXT: roundpd $8, %xmm1, %xmm1
+; SSE-NEXT: roundpd $8, %xmm2, %xmm2
+; SSE-NEXT: roundpd $8, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_roundeven_v8f64_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $8, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundpd $8, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_roundeven_v8f64_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $8, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundpd $8, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_roundeven_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $8, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a1)
+ %v2 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a2)
+ %v3 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a3)
+ %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_roundeven_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_roundeven_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $8, %xmm0, %xmm0
+; SSE-NEXT: roundps $8, %xmm1, %xmm1
+; SSE-NEXT: roundps $8, %xmm2, %xmm2
+; SSE-NEXT: roundps $8, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_roundeven_v16f32_v4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $8, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundps $8, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_roundeven_v16f32_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $8, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundps $8, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_roundeven_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $8, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+define <8 x double> @concat_roundeven_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: concat_roundeven_v8f64_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $8, %xmm0, %xmm0
+; SSE-NEXT: roundpd $8, %xmm1, %xmm1
+; SSE-NEXT: roundpd $8, %xmm2, %xmm2
+; SSE-NEXT: roundpd $8, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_roundeven_v8f64_v4f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundpd $8, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundpd $8, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_roundeven_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $8, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %a0)
+ %v1 = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %a1)
+ %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_roundeven_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: concat_roundeven_v16f32_v8f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $8, %xmm0, %xmm0
+; SSE-NEXT: roundps $8, %xmm1, %xmm1
+; SSE-NEXT: roundps $8, %xmm2, %xmm2
+; SSE-NEXT: roundps $8, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_roundeven_v16f32_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundps $8, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundps $8, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_roundeven_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $8, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %a0)
+ %v1 = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %a1)
+ %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-fsqrt.ll b/llvm/test/CodeGen/X86/combine-fsqrt.ll
new file mode 100644
index 0000000..f30eac1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-fsqrt.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <4 x double> @concat_sqrt_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: concat_sqrt_v4f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtpd %xmm0, %xmm0
+; SSE-NEXT: sqrtpd %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_sqrt_v4f64_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vsqrtpd %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a1)
+ %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @concat_sqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_sqrt_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtps %xmm0, %xmm0
+; SSE-NEXT: sqrtps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_sqrt_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vsqrtps %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x double> @concat_sqrt_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; SSE-LABEL: concat_sqrt_v8f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtpd %xmm0, %xmm0
+; SSE-NEXT: sqrtpd %xmm1, %xmm1
+; SSE-NEXT: sqrtpd %xmm2, %xmm2
+; SSE-NEXT: sqrtpd %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_sqrt_v8f64_v2f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT: vsqrtpd %ymm0, %ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1OR2-NEXT: vsqrtpd %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_sqrt_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vsqrtpd %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a1)
+ %v2 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a2)
+ %v3 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a3)
+ %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_sqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_sqrt_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtps %xmm0, %xmm0
+; SSE-NEXT: sqrtps %xmm1, %xmm1
+; SSE-NEXT: sqrtps %xmm2, %xmm2
+; SSE-NEXT: sqrtps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_sqrt_v16f32_v4f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT: vsqrtps %ymm0, %ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1OR2-NEXT: vsqrtps %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_sqrt_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vsqrtps %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+define <8 x double> @concat_sqrt_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: concat_sqrt_v8f64_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtpd %xmm0, %xmm0
+; SSE-NEXT: sqrtpd %xmm1, %xmm1
+; SSE-NEXT: sqrtpd %xmm2, %xmm2
+; SSE-NEXT: sqrtpd %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_sqrt_v8f64_v4f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vsqrtpd %ymm0, %ymm0
+; AVX1OR2-NEXT: vsqrtpd %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_sqrt_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vsqrtpd %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0)
+ %v1 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a1)
+ %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_sqrt_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: concat_sqrt_v16f32_v8f32:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtps %xmm0, %xmm0
+; SSE-NEXT: sqrtps %xmm1, %xmm1
+; SSE-NEXT: sqrtps %xmm2, %xmm2
+; SSE-NEXT: sqrtps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_sqrt_v16f32_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vsqrtps %ymm0, %ymm0
+; AVX1OR2-NEXT: vsqrtps %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_sqrt_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vsqrtps %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0)
+ %v1 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a1)
+ %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-ftrunc.ll b/llvm/test/CodeGen/X86/combine-ftrunc.ll
new file mode 100644
index 0000000..3dde226
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-ftrunc.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <4 x double> @concat_trunc_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: concat_trunc_v4f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $11, %xmm0, %xmm0
+; SSE-NEXT: roundpd $11, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_trunc_v4f64_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a1)
+ %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @concat_trunc_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_trunc_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $11, %xmm0, %xmm0
+; SSE-NEXT: roundps $11, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_trunc_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a1)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x double> @concat_trunc_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; SSE-LABEL: concat_trunc_v8f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $11, %xmm0, %xmm0
+; SSE-NEXT: roundpd $11, %xmm1, %xmm1
+; SSE-NEXT: roundpd $11, %xmm2, %xmm2
+; SSE-NEXT: roundpd $11, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_trunc_v8f64_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_trunc_v8f64_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_trunc_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a1)
+ %v2 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a2)
+ %v3 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a3)
+ %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_trunc_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_trunc_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $11, %xmm0, %xmm0
+; SSE-NEXT: roundps $11, %xmm1, %xmm1
+; SSE-NEXT: roundps $11, %xmm2, %xmm2
+; SSE-NEXT: roundps $11, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_trunc_v16f32_v4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_trunc_v16f32_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_trunc_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+define <8 x double> @concat_trunc_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: concat_trunc_v8f64_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $11, %xmm0, %xmm0
+; SSE-NEXT: roundpd $11, %xmm1, %xmm1
+; SSE-NEXT: roundpd $11, %xmm2, %xmm2
+; SSE-NEXT: roundpd $11, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_trunc_v8f64_v4f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_trunc_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x double> @llvm.trunc.v4f64(<4 x double> %a0)
+ %v1 = call <4 x double> @llvm.trunc.v4f64(<4 x double> %a1)
+ %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_trunc_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: concat_trunc_v16f32_v8f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $11, %xmm0, %xmm0
+; SSE-NEXT: roundps $11, %xmm1, %xmm1
+; SSE-NEXT: roundps $11, %xmm2, %xmm2
+; SSE-NEXT: roundps $11, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_trunc_v16f32_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_trunc_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <8 x float> @llvm.trunc.v8f32(<8 x float> %a0)
+ %v1 = call <8 x float> @llvm.trunc.v8f32(<8 x float> %a1)
+ %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-icmp.ll b/llvm/test/CodeGen/X86/combine-icmp.ll
new file mode 100644
index 0000000..dba5839
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-icmp.ll
@@ -0,0 +1,905 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
+
+define i4 @concat_icmp_v4i64_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: concat_icmp_v4i64_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: movmskps %xmm0, %eax
+; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: concat_icmp_v4i64_v2i64:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm2, %xmm1
+; SSE42-NEXT: packssdw %xmm1, %xmm0
+; SSE42-NEXT: movmskps %xmm0, %eax
+; SSE42-NEXT: xorl $15, %eax
+; SSE42-NEXT: # kill: def $al killed $al killed $eax
+; SSE42-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_icmp_v4i64_v2i64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1OR2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vmovmskps %xmm0, %eax
+; AVX1OR2-NEXT: xorl $15, %eax
+; AVX1OR2-NEXT: # kill: def $al killed $al killed $eax
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v4i64_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vptestmq %xmm0, %xmm0, %k0
+; AVX512-NEXT: vptestmq %xmm1, %xmm1, %k1
+; AVX512-NEXT: kshiftlb $2, %k1, %k1
+; AVX512-NEXT: korw %k1, %k0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retq
+ %v0 = icmp ne <2 x i64> %a0, zeroinitializer
+ %v1 = icmp ne <2 x i64> %a1, zeroinitializer
+ %v = shufflevector <2 x i1> %v0, <2 x i1> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r = bitcast <4 x i1> %v to i4
+ ret i4 %r
+}
+
+define i8 @concat_icmp_v8i32_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE-LABEL: concat_icmp_v8i32_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_icmp_v8i32_v4i32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
+; AVX1OR2-NEXT: # kill: def $al killed $al killed $eax
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v8i32_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp eq <4 x i32> %a0, zeroinitializer
+ %v1 = icmp eq <4 x i32> %a1, zeroinitializer
+ %v = shufflevector <4 x i1> %v0, <4 x i1> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r = bitcast <8 x i1> %v to i8
+ ret i8 %r
+}
+
+define i16 @concat_icmp_v16i16_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE2-LABEL: concat_icmp_v16i16_v8i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psubusw %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqw %xmm0, %xmm3
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqw %xmm0, %xmm2
+; SSE2-NEXT: packsswb %xmm2, %xmm3
+; SSE2-NEXT: pmovmskb %xmm3, %eax
+; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: concat_icmp_v16i16_v8i16:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: pmaxuw %xmm2, %xmm3
+; SSE42-NEXT: pcmpeqw %xmm0, %xmm3
+; SSE42-NEXT: pmaxuw %xmm1, %xmm2
+; SSE42-NEXT: pcmpeqw %xmm1, %xmm2
+; SSE42-NEXT: packsswb %xmm2, %xmm3
+; SSE42-NEXT: pmovmskb %xmm3, %eax
+; SSE42-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v16i16_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v16i16_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
+; AVX2-NEXT: vpmaxuw %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpmaxuw %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v16i16_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp ugt <8 x i16> %a0, splat (i16 1)
+ %v1 = icmp ugt <8 x i16> %a1, splat (i16 1)
+ %v = shufflevector <8 x i1> %v0, <8 x i1> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %r = bitcast <16 x i1> %v to i16
+ ret i16 %r
+}
+
+define i32 @concat_icmp_v32i8_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: concat_icmp_v32i8_v16i8:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; SSE-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE-NEXT: pmovmskb %xmm0, %ecx
+; SSE-NEXT: pmovmskb %xmm1, %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v32i8_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovmskb %xmm0, %ecx
+; AVX1-NEXT: vpmovmskb %xmm1, %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %ecx, %eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v32i8_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v32i8_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp sgt <16 x i8> %a0, splat (i8 5)
+ %v1 = icmp sgt <16 x i8> %a1, splat (i8 5)
+ %v = shufflevector <16 x i1> %v0, <16 x i1> %v1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %r = bitcast <32 x i1> %v to i32
+ ret i32 %r
+}
+
+define i8 @concat_icmp_v8i64_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> %a3) {
+; SSE2-LABEL: concat_icmp_v8i64_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483776,2147483776,2147483776,2147483648]
+; SSE2-NEXT: movdqa %xmm5, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
+; SSE2-NEXT: movdqa %xmm5, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: packsswb %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: concat_icmp_v8i64_v2i64:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: pxor %xmm4, %xmm0
+; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775936,9223372036854775936]
+; SSE42-NEXT: movdqa %xmm5, %xmm6
+; SSE42-NEXT: pcmpgtq %xmm0, %xmm6
+; SSE42-NEXT: pxor %xmm4, %xmm1
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; SSE42-NEXT: packssdw %xmm0, %xmm6
+; SSE42-NEXT: pxor %xmm4, %xmm2
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; SSE42-NEXT: pxor %xmm4, %xmm3
+; SSE42-NEXT: pcmpgtq %xmm3, %xmm5
+; SSE42-NEXT: packssdw %xmm5, %xmm0
+; SSE42-NEXT: packssdw %xmm6, %xmm6
+; SSE42-NEXT: packssdw %xmm0, %xmm0
+; SSE42-NEXT: packsswb %xmm0, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,3]
+; SSE42-NEXT: pmovmskb %xmm0, %eax
+; SSE42-NEXT: # kill: def $al killed $al killed $eax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v8i64_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775936,9223372036854775936]
+; AVX1-NEXT: # xmm5 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v8i64_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [9223372036854775936,9223372036854775936]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm1
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v8i64_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp ult <2 x i64> %a0, splat (i64 128)
+ %v1 = icmp ult <2 x i64> %a1, splat (i64 128)
+ %v2 = icmp ult <2 x i64> %a2, splat (i64 128)
+ %v3 = icmp ult <2 x i64> %a3, splat (i64 128)
+ %v01 = shufflevector <2 x i1> %v0, <2 x i1> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v23 = shufflevector <2 x i1> %v2, <2 x i1> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v = shufflevector <4 x i1> %v01, <4 x i1> %v23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r = bitcast <8 x i1> %v to i8
+ ret i8 %r
+}
+
+define i16 @concat_icmp_v16i32_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
+; SSE-LABEL: concat_icmp_v16i32_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_icmp_v16i32_v4i32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1OR2-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpcmpgtd %xmm4, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpcmpgtd %xmm4, %xmm2, %xmm1
+; AVX1OR2-NEXT: vpcmpgtd %xmm4, %xmm3, %xmm2
+; AVX1OR2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
+; AVX1OR2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v16i32_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp sgt <4 x i32> %a0, zeroinitializer
+ %v1 = icmp sgt <4 x i32> %a1, zeroinitializer
+ %v2 = icmp sgt <4 x i32> %a2, zeroinitializer
+ %v3 = icmp sgt <4 x i32> %a3, zeroinitializer
+ %v01 = shufflevector <4 x i1> %v0, <4 x i1> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v23 = shufflevector <4 x i1> %v2, <4 x i1> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v = shufflevector <8 x i1> %v01, <8 x i1> %v23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %r = bitcast <16 x i1> %v to i16
+ ret i16 %r
+}
+
+define i32 @concat_icmp_v32i16_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
+; SSE-LABEL: concat_icmp_v32i16_v8i16:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pcmpeqw %xmm4, %xmm0
+; SSE-NEXT: pcmpeqw %xmm4, %xmm1
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: pcmpeqw %xmm4, %xmm2
+; SSE-NEXT: pcmpeqw %xmm4, %xmm3
+; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: pmovmskb %xmm0, %ecx
+; SSE-NEXT: xorl $65535, %ecx # imm = 0xFFFF
+; SSE-NEXT: pmovmskb %xmm2, %eax
+; SSE-NEXT: notl %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v32i16_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm2
+; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovmskb %xmm0, %ecx
+; AVX1-NEXT: xorl $65535, %ecx # imm = 0xFFFF
+; AVX1-NEXT: vpmovmskb %xmm1, %eax
+; AVX1-NEXT: notl %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %ecx, %eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v32i16_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpacksswb %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v32i16_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vptestmw %zmm0, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp ne <8 x i16> %a0, zeroinitializer
+ %v1 = icmp ne <8 x i16> %a1, zeroinitializer
+ %v2 = icmp ne <8 x i16> %a2, zeroinitializer
+ %v3 = icmp ne <8 x i16> %a3, zeroinitializer
+ %v01 = shufflevector <8 x i1> %v0, <8 x i1> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v23 = shufflevector <8 x i1> %v2, <8 x i1> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v = shufflevector <16 x i1> %v01, <16 x i1> %v23, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %r = bitcast <32 x i1> %v to i32
+ ret i32 %r
+}
+
+define i64 @concat_icmp_v64i8_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 x i8> %a3) {
+; SSE-LABEL: concat_icmp_v64i8_v16i8:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pmaxub %xmm4, %xmm5
+; SSE-NEXT: pcmpeqb %xmm0, %xmm5
+; SSE-NEXT: pmovmskb %xmm5, %eax
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pmaxub %xmm4, %xmm0
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %ecx
+; SSE-NEXT: shll $16, %ecx
+; SSE-NEXT: orl %eax, %ecx
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pmaxub %xmm4, %xmm0
+; SSE-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %edx
+; SSE-NEXT: pmaxub %xmm3, %xmm4
+; SSE-NEXT: pcmpeqb %xmm3, %xmm4
+; SSE-NEXT: pmovmskb %xmm4, %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: shlq $32, %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v64i8_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT: vpmaxub %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxub %xmm4, %xmm1, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpmaxub %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpmaxub %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: vpmovmskb %xmm1, %ecx
+; AVX1-NEXT: shll $16, %ecx
+; AVX1-NEXT: orl %eax, %ecx
+; AVX1-NEXT: vpmovmskb %xmm2, %edx
+; AVX1-NEXT: vpmovmskb %xmm3, %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %edx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v64i8_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpmaxub %xmm4, %xmm0, %xmm5
+; AVX2-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vpmaxub %xmm4, %xmm1, %xmm5
+; AVX2-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxub %xmm4, %xmm2, %xmm5
+; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpmaxub %xmm4, %xmm3, %xmm4
+; AVX2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v64i8_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512-NEXT: kmovq %k0, %rax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp ugt <16 x i8> %a0, splat (i8 15)
+ %v1 = icmp ugt <16 x i8> %a1, splat (i8 15)
+ %v2 = icmp ugt <16 x i8> %a2, splat (i8 15)
+ %v3 = icmp ugt <16 x i8> %a3, splat (i8 15)
+ %v01 = shufflevector <16 x i1> %v0, <16 x i1> %v1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %v23 = shufflevector <16 x i1> %v2, <16 x i1> %v3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %v = shufflevector <32 x i1> %v01, <32 x i1> %v23, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ %r = bitcast <64 x i1> %v to i64
+ ret i64 %r
+}
+
+define i8 @concat_icmp_v8i64_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: concat_icmp_v8i64_v4i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,3,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,3,2]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-NEXT: andps %xmm1, %xmm2
+; SSE2-NEXT: packssdw %xmm2, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: packsswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: concat_icmp_v8i64_v4i64:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm4, %xmm4
+; SSE42-NEXT: pcmpeqq %xmm4, %xmm1
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE42-NEXT: pcmpeqq %xmm4, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE42-NEXT: pcmpeqq %xmm4, %xmm3
+; SSE42-NEXT: pcmpeqq %xmm4, %xmm2
+; SSE42-NEXT: packssdw %xmm3, %xmm2
+; SSE42-NEXT: packssdw %xmm2, %xmm2
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE42-NEXT: packsswb %xmm0, %xmm0
+; SSE42-NEXT: pmovmskb %xmm0, %eax
+; SSE42-NEXT: # kill: def $al killed $al killed $eax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v8i64_v4i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v8i64_v4i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0,
%xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_icmp_v8i64_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %v0 = icmp eq <4 x i64> %a0, zeroinitializer + %v1 = icmp eq <4 x i64> %a1, zeroinitializer + %v = shufflevector <4 x i1> %v0, <4 x i1> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r = bitcast <8 x i1> %v to i8 + ret i8 %r +} + +define i16 @concat_icmp_v16i32_v8i32(<8 x i32> %a0, <8 x i32> %a1) { +; SSE2-LABEL: concat_icmp_v16i32_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483649,2147483649,2147483649,2147483649] +; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: packsswb %xmm2, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: retq +; +; SSE42-LABEL: concat_icmp_v16i32_v8i32: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [2,2,2,2] +; SSE42-NEXT: movdqa %xmm1, %xmm5 +; SSE42-NEXT: pmaxud %xmm4, %xmm5 +; SSE42-NEXT: pcmpeqd %xmm1, %xmm5 +; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: pmaxud %xmm4, %xmm1 +; SSE42-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE42-NEXT: packssdw %xmm5, %xmm1 +; SSE42-NEXT: movdqa %xmm3, %xmm0 +; SSE42-NEXT: pmaxud %xmm4, %xmm0 +; SSE42-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE42-NEXT: pmaxud %xmm2, %xmm4 +; SSE42-NEXT: pcmpeqd %xmm2, %xmm4 +; SSE42-NEXT: packssdw %xmm0, %xmm4 +; SSE42-NEXT: packsswb %xmm4, %xmm1 +; SSE42-NEXT: pmovmskb %xmm1, %eax +; SSE42-NEXT: # kill: def $ax killed $ax killed $eax +; SSE42-NEXT: retq +; +; AVX1-LABEL: concat_icmp_v16i32_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,2,2,2] +; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmaxud %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: # 
kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_icmp_v16i32_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxud %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_icmp_v16i32_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %v0 = icmp ugt <8 x i32> %a0, splat (i32 1) + %v1 = icmp ugt <8 x i32> %a1, splat (i32 1) + %v = shufflevector <8 x i1> %v0, <8 x i1> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %r = bitcast <16 x i1> %v to i16 + ret i16 %r +} + +define i32 @concat_icmp_v32i16_v16i16(<16 x i16> %a0, <16 x i16> %a1) { +; SSE-LABEL: concat_icmp_v32i16_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [5,5,5,5,5,5,5,5] +; SSE-NEXT: pcmpgtw %xmm4, %xmm1 +; SSE-NEXT: pcmpgtw %xmm4, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pcmpgtw %xmm4, %xmm3 +; SSE-NEXT: pcmpgtw %xmm4, %xmm2 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: pmovmskb %xmm0, %ecx +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: orl %ecx, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: concat_icmp_v32i16_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,5,5,5,5,5,5,5] +; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovmskb %xmm0, %ecx +; AVX1-NEXT: vpmovmskb %xmm1, %eax +; AVX1-NEXT: shll $16, %eax +; AVX1-NEXT: orl %ecx, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_icmp_v32i16_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_icmp_v32i16_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %v0 = icmp sgt <16 x i16> %a0, splat (i16 5) + %v1 = icmp sgt <16 x i16> %a1, splat (i16 5) + %v = shufflevector <16 x i1> %v0, <16 x i1> %v1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 
7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %r = bitcast <32 x i1> %v to i32 + ret i32 %r +} + +define i64 @concat_icmp_v64i8_v32i8(<32 x i8> %a0, <32 x i8> %a1) { +; SSE-LABEL: concat_icmp_v64i8_v32i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pcmpgtb %xmm0, %xmm5 +; SSE-NEXT: pmovmskb %xmm5, %eax +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %ecx +; SSE-NEXT: shll $16, %ecx +; SSE-NEXT: orl %eax, %ecx +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pcmpgtb %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %edx +; SSE-NEXT: pcmpgtb %xmm3, %xmm4 +; SSE-NEXT: pmovmskb %xmm4, %eax +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: orl %edx, %eax +; SSE-NEXT: shlq $32, %rax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: concat_icmp_v64i8_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3 +; AVX1-NEXT: vpmovmskb %xmm3, %eax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %ecx +; AVX1-NEXT: shll $16, %ecx +; AVX1-NEXT: orl %eax, %ecx +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %edx +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: shll $16, %eax +; AVX1-NEXT: orl %edx, %eax +; AVX1-NEXT: shlq $32, %rax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_icmp_v64i8_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %ecx +; AVX2-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: shlq $32, %rax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_icmp_v64i8_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpcmpltb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0 +; AVX512-NEXT: kmovq %k0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %v0 = icmp slt <32 x i8> %a0, splat (i8 1) + %v1 = icmp slt <32 x i8> %a1, splat (i8 1) + %v = shufflevector <32 x i1> %v0, <32 x i1> %v1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + %r = bitcast <64 x i1> %v to i64 + ret i64 %r +} diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll index 29c41ca..15d187a 100644 --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -504,7 +504,7 @@ define <16 x i8> @PR35579(<16 x i8> %x) { ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pmaddubsw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; SSE-NEXT: psllw $8, %xmm1 -; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,2,0,4,0,2,0,8,0,2,0,4,0,2,0] +; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,1,4,1,2,1,8,1,2,1,4,1,2,1] ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-rcp.ll b/llvm/test/CodeGen/X86/combine-rcp.ll new file mode 100644 index 0000000..4647516 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-rcp.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <8 x float> @concat_rcp_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_rcp_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rcpps %xmm0, %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rcp_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vrcpps %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +; Ensure we don't convert rcpps to rcp14ps +define <16 x float> @concat_rcp_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_rcp_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rcpps %xmm0, %xmm0 +; SSE-NEXT: rcpps %xmm1, %xmm1 +; SSE-NEXT: rcpps %xmm2, %xmm2 +; SSE-NEXT: rcpps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rcp_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vrcpps %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1OR2-NEXT: vrcpps %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rcp_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vrcpps %ymm0, %ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX512-NEXT: vrcpps %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1) + %v2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a2) + %v3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 
2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-rndscale.ll b/llvm/test/CodeGen/X86/combine-rndscale.ll new file mode 100644 index 0000000..b557dd8 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-rndscale.ll @@ -0,0 +1,162 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <4 x double> @concat_roundpd_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { +; AVX-LABEL: concat_roundpd_v4f64_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4) + %v1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a1, i32 4) + %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x double> %res +} + +define <8 x float> @concat_roundps_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; AVX-LABEL: concat_roundps_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4) + %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +define <8 x double> @concat_roundpd_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) { +; AVX1-LABEL: concat_roundpd_v8f64_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_roundpd_v8f64_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX2-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_roundpd_v8f64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vrndscalepd $4, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4) + %v1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a1, i32 4) + %v2 = call <2 x double> 
@llvm.x86.sse41.round.pd(<2 x double> %a2, i32 4) + %v3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a3, i32 4) + %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_roundps_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; AVX1-LABEL: concat_roundps_v16f32_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: concat_roundps_v16f32_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX2-NEXT: vroundps $4, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: concat_roundps_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vrndscaleps $4, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4) + %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4) + %v2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a2, i32 4) + %v3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a3, i32 4) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +define <8 x double> @concat_roundpd_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) { +; AVX1OR2-LABEL: concat_roundpd_v8f64_v4f64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundpd $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundpd $4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundpd_v8f64_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vrndscalepd $4, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4) + %v1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a1, i32 4) + %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %res +} + +define <16 x float> @concat_roundps_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) { +; AVX1OR2-LABEL: concat_roundps_v16f32_v8f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vroundps $4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vroundps 
$4, %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_roundps_v16f32_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vrndscaleps $4, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4) + %v1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a1, i32 4) + %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} + +; negative test - rounding mode mismatch +define <8 x float> @concat_roundps_v8f32_v4f32_mismatch(<4 x float> %a0, <4 x float> %a1) { +; AVX-LABEL: concat_roundps_v8f32_v4f32_mismatch: +; AVX: # %bb.0: +; AVX-NEXT: vroundps $0, %xmm0, %xmm0 +; AVX-NEXT: vroundps $4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 0) + %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-rsqrt.ll b/llvm/test/CodeGen/X86/combine-rsqrt.ll new file mode 100644 index 0000000..b373458 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-rsqrt.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 + +define <8 x float> @concat_rsqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: concat_rsqrt_v8f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtps %xmm0, %xmm0 +; SSE-NEXT: rsqrtps %xmm1, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: concat_rsqrt_v8f32_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vrsqrtps %ymm0, %ymm0 +; AVX-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1) + %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %res +} + +; Ensure we don't convert rsqrtps to rsqrt14ps +define <16 x float> @concat_rsqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) { +; SSE-LABEL: concat_rsqrt_v16f32_v4f32: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtps %xmm0, %xmm0 +; SSE-NEXT: rsqrtps %xmm1, %xmm1 +; SSE-NEXT: rsqrtps %xmm2, %xmm2 +; SSE-NEXT: rsqrtps %xmm3, %xmm3 +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: concat_rsqrt_v16f32_v4f32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vrsqrtps %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; 
AVX1OR2-NEXT: vrsqrtps %ymm1, %ymm1 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: concat_rsqrt_v16f32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vrsqrtps %ymm0, %ymm0 +; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX512-NEXT: vrsqrtps %ymm1, %ymm1 +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) + %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1) + %v2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a2) + %v3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a3) + %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x float> %res +} diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll index 36e374b..e601c57 100644 --- a/llvm/test/CodeGen/X86/combine-sub-usat.ll +++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll @@ -112,6 +112,69 @@ define <8 x i16> @combine_zero_v8i16(<8 x i16> %a0) { ret <8 x i16> %1 } +; fold (usub_sat x, 1) -> sub(x, zext(x != 0)) +define i32 @combine_dec_i32(i32 %a0) { +; CHECK-LABEL: combine_dec_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: subl $1, %edi +; CHECK-NEXT: cmovael %edi, %eax +; CHECK-NEXT: retq + %1 = call i32 @llvm.usub.sat.i32(i32 %a0, i32 1) + ret i32 %1 +} + +; fold (usub_sat x, 1) -> add(x, sext(x != 0)) +define <4 x i32> @combine_dec_v4i32(<4 x i32> %a0) { +; SSE2-LABEL: combine_dec_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: combine_dec_v4i32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: paddd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; SSE42-LABEL: combine_dec_v4i32: +; SSE42: # %bb.0: +; SSE42-NEXT: pmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE42-NEXT: paddd %xmm1, %xmm0 +; SSE42-NEXT: retq +; +; AVX1-LABEL: combine_dec_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_dec_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: combine_dec_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a0, <4 x i32> splat (i32 1)) + ret <4 x i32> %1 +} + ; fold (usub_sat x, x) -> 0 define i32 
@combine_self_i32(i32 %a0) { ; CHECK-LABEL: combine_self_i32: diff --git a/llvm/test/CodeGen/X86/compress-undef-float-passthrough.ll b/llvm/test/CodeGen/X86/compress-undef-float-passthrough.ll index 47331db..b19112c 100644 --- a/llvm/test/CodeGen/X86/compress-undef-float-passthrough.ll +++ b/llvm/test/CodeGen/X86/compress-undef-float-passthrough.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -mattr=avx512f,avx512vl | FileCheck %s --check-prefix=CHECK -define void @test_compress_undef_float_passthrough() { +define void @test_compress_undef_float_passthrough(<4 x double> %a0) { ; CHECK-LABEL: test_compress_undef_float_passthrough: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movb $5, %al @@ -12,7 +12,7 @@ define void @test_compress_undef_float_passthrough() { ; CHECK-NEXT: retq entry: ; preds = %loop.50 %0 = bitcast i4 undef to <4 x i1> - %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> <i1 1, i1 0, i1 1, i1 0>) + %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %a0, <4 x double> undef, <4 x i1> <i1 1, i1 0, i1 1, i1 0>) call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %1, <4 x ptr> undef, i32 0, <4 x i1> %0) ret void } diff --git a/llvm/test/CodeGen/X86/dag-combine-counter.ll b/llvm/test/CodeGen/X86/dag-combine-counter.ll index 4cc3c71b..9b56586 100644 --- a/llvm/test/CodeGen/X86/dag-combine-counter.ll +++ b/llvm/test/CodeGen/X86/dag-combine-counter.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=x86_64-- -debug-counter=dagcombine=0-5 < %s | FileCheck %s -; REQUIRES: asserts - define i32 @test(i32 %x) { ; CHECK-LABEL: test: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll deleted file mode 100644 index 6bbf3eb..0000000 --- a/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc -x86-discriminate-memops < %s | FileCheck %s -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -declare void @llvm.prefetch(ptr, i32, i32, i32) -; Function Attrs: norecurse nounwind readonly uwtable -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom - %0 = load i32, ptr %arrayidx, align 4 - %idxprom1 = sext i32 %pos2 to i64 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1 - %1 = load i32, ptr %arrayidx2, align 4 - %add = add nsw i32 %1, %0, !dbg !15 - ret i32 %add -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"} 
-!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 2, column: 10, scope: !7) -!10 = !{!11, !11, i64 0} -!11 = !{!"int", !12, i64 0} -!12 = !{!"omnipotent char", !13, i64 0} -!13 = !{!"Simple C++ TBAA"} -!15 = !DILocation(line: 2, column: 20, scope: !7) - - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: .loc 1 1 0 {{.*}} discriminator 2 -;CHECK-NEXT: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 2 20 -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll deleted file mode 100644 index ca412c5..0000000 --- a/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc -x86-discriminate-memops < %s | FileCheck %s -; RUN: llc -x86-discriminate-memops -x86-bypass-prefetch-instructions=0 < %s | FileCheck %s -check-prefix=NOBYPASS -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -declare void @llvm.prefetch(ptr, i32, i32, i32) -; Function Attrs: norecurse nounwind readonly uwtable -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !9 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !9 - %0 = load i32, ptr %arrayidx, align 4, !dbg !9, !tbaa !10 - %idxprom1 = sext i32 %pos2 to i64, !dbg !14 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !14 - call void @llvm.prefetch(ptr %arrayidx2, i32 0, i32 3, i32 1) - %1 = load i32, ptr %arrayidx2, align 4, !dbg !14, !tbaa !10 - %add = add nsw i32 %1, %0, !dbg !15 - ret i32 %add, !dbg !16 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 2, column: 10, scope: !7) -!10 = !{!11, !11, i64 0} -!11 = !{!"int", !12, i64 0} -!12 = !{!"omnipotent char", !13, i64 0} -!13 = !{!"Simple C++ TBAA"} -!14 = !DILocation(line: 2, column: 22, scope: !7) -!15 = !DILocation(line: 2, column: 20, scope: !7) -!16 = !DILocation(line: 2, column: 3, scope: !7) - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: prefetcht0 (%rdi,%rax,4) -;CHECK-NEXT: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 2 20 discriminator 2 # test.cc:2:20 -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax -;CHECK-NEXT: .loc 1 2 3 # test.cc:2:3 - -;NOBYPASS-LABEL: sum: -;NOBYPASS: # %bb.0: -;NOBYPASS: prefetcht0 (%rdi,%rax,4) 
-;NOBYPASS-NEXT: .loc 1 2 22 -;NOBYPASS-NEXT: movl (%rdi,%rax,4), %eax -;NOBYPASS-NEXT: .loc 1 2 20 {{.*}} discriminator 2 # test.cc:2:20 -;NOBYPASS-NEXT: addl (%rdi,%rcx,4), %eax -;NOBYPASS-NEXT: .loc 1 2 3 # test.cc:2:3 diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops.ll deleted file mode 100644 index a8421d9..0000000 --- a/llvm/test/CodeGen/X86/discriminate-mem-ops.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc -x86-discriminate-memops < %s | FileCheck %s -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: norecurse nounwind readonly uwtable -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !9 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !9 - %0 = load i32, ptr %arrayidx, align 4, !dbg !9, !tbaa !10 - %idxprom1 = sext i32 %pos2 to i64, !dbg !14 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !14 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !14, !tbaa !10 - %add = add nsw i32 %1, %0, !dbg !15 - ret i32 %add, !dbg !16 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 2, column: 10, scope: !7) -!10 = !{!11, !11, i64 0} -!11 = !{!"int", !12, i64 0} -!12 = !{!"omnipotent char", !13, i64 0} -!13 = !{!"Simple C++ TBAA"} -!14 = !DILocation(line: 2, column: 22, scope: !7) -!15 = !DILocation(line: 2, column: 20, scope: !7) -!16 = !DILocation(line: 2, column: 3, scope: !7) - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 2 20 discriminator 2 # test.cc:2:20 -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax -;CHECK-NEXT: .loc 1 2 3 # test.cc:2:3 diff --git a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll index 3243d95..e2400fb 100644 --- a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll +++ b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll @@ -106,7 +106,8 @@ define <4 x i32> @eq_or_eq_ult_2_fail_multiuse(<4 x i32> %x) { ; AVX512: # %bb.0: ; AVX512-NEXT: subq $24, %rsp ; AVX512-NEXT: .cfi_def_cfa_offset 32 -; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX512-NEXT: callq use.v4.i32@PLT ; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/fmaxnum.ll b/llvm/test/CodeGen/X86/fmaxnum.ll index 150bef0..6a03628 100644 --- 
a/llvm/test/CodeGen/X86/fmaxnum.ll +++ b/llvm/test/CodeGen/X86/fmaxnum.ll @@ -676,15 +676,44 @@ define float @test_maxnum_neg_inf_nnan(float %x, float %y) nounwind { ; Test SNaN quieting define float @test_maxnum_snan(float %x) { -; SSE-LABEL: test_maxnum_snan: -; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSE-NEXT: retq +; SSE2-LABEL: test_maxnum_snan: +; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: test_maxnum_snan: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; AVX-NEXT: retq +; SSE4-LABEL: test_maxnum_snan: +; SSE4: # %bb.0: +; SSE4-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSE4-NEXT: maxss %xmm0, %xmm1 +; SSE4-NEXT: cmpunordss %xmm0, %xmm0 +; SSE4-NEXT: blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE4-NEXT: movaps %xmm1, %xmm0 +; SSE4-NEXT: retq +; +; AVX1-LABEL: test_maxnum_snan: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_maxnum_snan: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; AVX512-NEXT: vmaxss %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq %r = call float @llvm.maxnum.f32(float 0x7ff4000000000000, float %x) ret float %r } diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll index aae6cda..e0dea64 100644 --- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll +++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll @@ -8,8 +8,10 @@ declare float @llvm.maximumnum.f32(float, float) declare double @llvm.maximumnum.f64(double, double) +declare fp128 @llvm.maximumnum.f128(fp128, fp128) declare float @llvm.minimumnum.f32(float, float) declare double @llvm.minimumnum.f64(double, double) +declare fp128 @llvm.minimumnum.f128(fp128, fp128) declare <2 x double> @llvm.minimumnum.v2f64(<2 x double>, <2 x double>) declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>) declare <4 x half> @llvm.maximumnum.v4f16(<4 x half>, <4 x half>) @@ -2569,3 +2571,383 @@ define float @test_fminimumnum_snan(float %x, float %y) { %1 = tail call float @llvm.minimumnum.f32(float 0x7ff4000000000000, float %y) ret float %1 } + +define fp128 @test_fmaximumnum_fp128(fp128 %x, fp128 %y) nounwind { +; SSE2-LABEL: test_fmaximumnum_fp128: +; SSE2: # %bb.0: # %start +; SSE2-NEXT: subq $40, %rsp +; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: callq __unordtf2@PLT +; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: jne .LBB39_2 +; SSE2-NEXT: # %bb.1: # %start +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: .LBB39_2: # %start +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: callq __unordtf2@PLT +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: jne .LBB39_4 +; SSE2-NEXT: # %bb.3: # %start +; SSE2-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: .LBB39_4: # %start +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE2-NEXT: callq __gttf2@PLT +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: jg .LBB39_6 +; SSE2-NEXT: # %bb.5: # %start +; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: .LBB39_6: # %start +; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE2-NEXT: callq __trunctfsf2@PLT +; SSE2-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: je .LBB39_8 +; SSE2-NEXT: # %bb.7: # %start +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: .LBB39_8: # %start +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: callq __eqtf2@PLT +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: je .LBB39_10 +; SSE2-NEXT: # %bb.9: # %start +; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: .LBB39_10: # %start +; SSE2-NEXT: addq $40, %rsp +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fmaximumnum_fp128: +; AVX: # %bb.0: # %start +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm0, %xmm1 +; AVX-NEXT: callq __unordtf2@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: testl %eax, %eax +; AVX-NEXT: vmovaps %xmm0, %xmm1 +; AVX-NEXT: jne .LBB39_2 +; AVX-NEXT: # %bb.1: # %start +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: .LBB39_2: # %start +; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm0, %xmm1 +; AVX-NEXT: callq __unordtf2@PLT +; AVX-NEXT: testl %eax, %eax +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: jne .LBB39_4 +; AVX-NEXT: # %bb.3: # %start +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: .LBB39_4: # %start +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX-NEXT: callq __gttf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: testl %eax, %eax +; AVX-NEXT: vmovdqa %xmm0, %xmm1 +; AVX-NEXT: jg .LBB39_6 +; AVX-NEXT: # %bb.5: # %start +; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: .LBB39_6: # %start +; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX-NEXT: callq __trunctfsf2@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: testl %eax, %eax +; AVX-NEXT: je .LBB39_8 +; AVX-NEXT: # %bb.7: # %start +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .LBB39_8: # %start +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: callq __eqtf2@PLT +; AVX-NEXT: testl %eax, %eax +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: je .LBB39_10 +; AVX-NEXT: # %bb.9: # %start +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; 
AVX-NEXT: .LBB39_10: # %start +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fmaximumnum_fp128: +; AVX10_2: # %bb.0: # %start +; AVX10_2-NEXT: subq $40, %rsp +; AVX10_2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX10_2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX10_2-NEXT: vmovaps %xmm0, %xmm1 +; AVX10_2-NEXT: callq __unordtf2@PLT +; AVX10_2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX10_2-NEXT: testl %eax, %eax +; AVX10_2-NEXT: vmovaps %xmm0, %xmm1 +; AVX10_2-NEXT: jne .LBB39_2 +; AVX10_2-NEXT: # %bb.1: # %start +; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX10_2-NEXT: .LBB39_2: # %start +; AVX10_2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX10_2-NEXT: vmovaps %xmm0, %xmm1 +; AVX10_2-NEXT: callq __unordtf2@PLT +; AVX10_2-NEXT: testl %eax, %eax +; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX10_2-NEXT: jne .LBB39_4 +; AVX10_2-NEXT: # %bb.3: # %start +; AVX10_2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX10_2-NEXT: .LBB39_4: # %start +; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX10_2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX10_2-NEXT: callq __gttf2@PLT +; AVX10_2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX10_2-NEXT: testl %eax, %eax +; AVX10_2-NEXT: vmovdqa %xmm0, %xmm1 +; AVX10_2-NEXT: jg .LBB39_6 +; AVX10_2-NEXT: # %bb.5: # %start +; AVX10_2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX10_2-NEXT: .LBB39_6: # %start +; AVX10_2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX10_2-NEXT: callq __trunctfsf2@PLT +; AVX10_2-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX10_2-NEXT: vmovd %xmm0, %eax +; AVX10_2-NEXT: testl %eax, %eax +; AVX10_2-NEXT: je .LBB39_8 +; AVX10_2-NEXT: # %bb.7: # %start +; AVX10_2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX10_2-NEXT: .LBB39_8: # %start +; AVX10_2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vmovaps %xmm2, %xmm0 +; AVX10_2-NEXT: callq __eqtf2@PLT +; AVX10_2-NEXT: testl %eax, %eax +; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX10_2-NEXT: je .LBB39_10 +; AVX10_2-NEXT: # %bb.9: # %start +; AVX10_2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX10_2-NEXT: .LBB39_10: # %start +; AVX10_2-NEXT: addq $40, %rsp +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fmaximumnum_fp128: +; X86: # %bb.0: # %start +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $80, %esp +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: vmovups 24(%ebp), %ymm0 +; X86-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: vzeroupper +; X86-NEXT: calll fmaximum_numl +; X86-NEXT: subl $4, %esp +; X86-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovaps %xmm0, (%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: leal -4(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +start: + %0 = tail call fp128 @llvm.maximumnum.f128(fp128 %x, fp128 %y) + ret fp128 %0 +} + +define fp128 @test_fminimumnum_fp128(fp128 %x, fp128 %y) nounwind { +; SSE2-LABEL: test_fminimumnum_fp128: +; SSE2: # %bb.0: # %start +; SSE2-NEXT: subq $40, %rsp +; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: callq __unordtf2@PLT +; SSE2-NEXT: movaps (%rsp), 
%xmm0 # 16-byte Reload +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: jne .LBB40_2 +; SSE2-NEXT: # %bb.1: # %start +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: .LBB40_2: # %start +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: callq __unordtf2@PLT +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: jne .LBB40_4 +; SSE2-NEXT: # %bb.3: # %start +; SSE2-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: .LBB40_4: # %start +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE2-NEXT: callq __lttf2@PLT +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: js .LBB40_6 +; SSE2-NEXT: # %bb.5: # %start +; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: .LBB40_6: # %start +; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE2-NEXT: callq __trunctfsf2@PLT +; SSE2-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: jo .LBB40_8 +; SSE2-NEXT: # %bb.7: # %start +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: .LBB40_8: # %start +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: callq __eqtf2@PLT +; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: je .LBB40_10 +; SSE2-NEXT: # %bb.9: # %start +; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: .LBB40_10: # %start +; SSE2-NEXT: addq $40, %rsp +; SSE2-NEXT: retq +; +; AVX-LABEL: test_fminimumnum_fp128: +; AVX: # %bb.0: # %start +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm0, %xmm1 +; AVX-NEXT: callq __unordtf2@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: testl %eax, %eax +; AVX-NEXT: vmovaps %xmm0, %xmm1 +; AVX-NEXT: jne .LBB40_2 +; AVX-NEXT: # %bb.1: # %start +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: .LBB40_2: # %start +; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm0, %xmm1 +; AVX-NEXT: callq __unordtf2@PLT +; AVX-NEXT: testl %eax, %eax +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: jne .LBB40_4 +; AVX-NEXT: # %bb.3: # %start +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: .LBB40_4: # %start +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX-NEXT: callq __lttf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: testl %eax, %eax +; AVX-NEXT: vmovdqa %xmm0, %xmm1 +; AVX-NEXT: js .LBB40_6 +; AVX-NEXT: # %bb.5: # %start +; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: .LBB40_6: # %start +; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX-NEXT: callq __trunctfsf2@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: jo .LBB40_8 +; AVX-NEXT: # %bb.7: # %start +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .LBB40_8: # %start +; AVX-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: callq __eqtf2@PLT +; AVX-NEXT: testl %eax, %eax +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: je .LBB40_10 +; AVX-NEXT: # %bb.9: # %start +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: .LBB40_10: # %start +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: retq +; +; AVX10_2-LABEL: test_fminimumnum_fp128: +; AVX10_2: # %bb.0: # %start +; AVX10_2-NEXT: subq $40, %rsp +; AVX10_2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX10_2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX10_2-NEXT: vmovaps %xmm0, %xmm1 +; AVX10_2-NEXT: callq __unordtf2@PLT +; AVX10_2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX10_2-NEXT: testl %eax, %eax +; AVX10_2-NEXT: vmovaps %xmm0, %xmm1 +; AVX10_2-NEXT: jne .LBB40_2 +; AVX10_2-NEXT: # %bb.1: # %start +; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX10_2-NEXT: .LBB40_2: # %start +; AVX10_2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX10_2-NEXT: vmovaps %xmm0, %xmm1 +; AVX10_2-NEXT: callq __unordtf2@PLT +; AVX10_2-NEXT: testl %eax, %eax +; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX10_2-NEXT: jne .LBB40_4 +; AVX10_2-NEXT: # %bb.3: # %start +; AVX10_2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX10_2-NEXT: .LBB40_4: # %start +; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX10_2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX10_2-NEXT: callq __lttf2@PLT +; AVX10_2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX10_2-NEXT: testl %eax, %eax +; AVX10_2-NEXT: vmovdqa %xmm0, %xmm1 +; AVX10_2-NEXT: js .LBB40_6 +; AVX10_2-NEXT: # %bb.5: # %start +; AVX10_2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX10_2-NEXT: .LBB40_6: # %start +; AVX10_2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX10_2-NEXT: callq __trunctfsf2@PLT +; AVX10_2-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX10_2-NEXT: vmovd %xmm0, %eax +; AVX10_2-NEXT: negl %eax +; AVX10_2-NEXT: jo .LBB40_8 +; AVX10_2-NEXT: # %bb.7: # %start +; AVX10_2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX10_2-NEXT: .LBB40_8: # %start +; AVX10_2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX10_2-NEXT: vmovaps %xmm2, %xmm0 +; AVX10_2-NEXT: callq __eqtf2@PLT +; AVX10_2-NEXT: testl %eax, %eax +; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX10_2-NEXT: je .LBB40_10 +; AVX10_2-NEXT: # %bb.9: # %start +; AVX10_2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX10_2-NEXT: .LBB40_10: # %start +; AVX10_2-NEXT: addq $40, %rsp +; AVX10_2-NEXT: retq +; +; X86-LABEL: test_fminimumnum_fp128: +; X86: # %bb.0: # %start +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $80, %esp +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: vmovups 24(%ebp), %ymm0 +; X86-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: vzeroupper +; X86-NEXT: calll fminimum_numl +; X86-NEXT: subl $4, %esp +; X86-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovaps %xmm0, (%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: leal -4(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +start: + %0 = tail call fp128 @llvm.minimumnum.f128(fp128 %x, fp128 %y) + ret fp128 %0 +} diff --git a/llvm/test/CodeGen/X86/fminnum.ll b/llvm/test/CodeGen/X86/fminnum.ll index 
4aa1a61..5c882c9 100644 --- a/llvm/test/CodeGen/X86/fminnum.ll +++ b/llvm/test/CodeGen/X86/fminnum.ll @@ -676,15 +676,44 @@ define float @test_minnum_inf_nnan(float %x, float %y) nounwind { ; Test SNaN quieting define float @test_minnum_snan(float %x) { -; SSE-LABEL: test_minnum_snan: -; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSE-NEXT: retq +; SSE2-LABEL: test_minnum_snan: +; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: cmpunordss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: minss %xmm0, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: test_minnum_snan: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; AVX-NEXT: retq +; SSE4-LABEL: test_minnum_snan: +; SSE4: # %bb.0: +; SSE4-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSE4-NEXT: minss %xmm0, %xmm1 +; SSE4-NEXT: cmpunordss %xmm0, %xmm0 +; SSE4-NEXT: blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE4-NEXT: movaps %xmm1, %xmm0 +; SSE4-NEXT: retq +; +; AVX1-LABEL: test_minnum_snan: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; AVX1-NEXT: vminss %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: test_minnum_snan: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; AVX512-NEXT: vminss %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq %r = call float @llvm.minnum.f32(float 0x7ff4000000000000, float %x) ret float %r } diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 81529af..b655bda 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -79,38 +79,54 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) { ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow2_ldexp_4xfloat: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: subq $40, %rsp -; CHECK-AVX-NEXT: .cfi_def_cfa_offset 48 -; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX-NEXT: vextractps $1, %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vmovd %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vextractps $2, %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm1[0,1],xmm0[0],xmm1[3] -; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-AVX-NEXT: vextractps $3, %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX-NEXT: callq ldexpf@PLT -; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; CHECK-AVX-NEXT: addq $40, %rsp -; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX-NEXT: retq +; CHECK-AVX2-LABEL: fmul_pow2_ldexp_4xfloat: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: subq $40, %rsp +; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48 +; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-AVX2-NEXT: vextractps $1, %xmm0, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vmovd %xmm0, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] +; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vextractps $2, %xmm0, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-AVX2-NEXT: vextractps $3, %xmm0, %edi +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-AVX2-NEXT: callq ldexpf@PLT +; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-AVX2-NEXT: addq $40, %rsp +; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 +; CHECK-AVX2-NEXT: retq +; +; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_4xfloat: +; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0] +; CHECK-ONLY-AVX512F-NEXT: vmovaps %xmm0, %xmm0 +; CHECK-ONLY-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0 +; CHECK-ONLY-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-ONLY-AVX512F-NEXT: vzeroupper +; CHECK-ONLY-AVX512F-NEXT: retq +; +; CHECK-SKX-LABEL: fmul_pow2_ldexp_4xfloat: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-SKX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0] +; CHECK-SKX-NEXT: vscalefps %xmm0, %xmm1, %xmm0 +; CHECK-SKX-NEXT: retq %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i) ret <4 x float> %r } @@ -562,79 +578,11 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; ; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf: ; CHECK-AVX512F: # %bb.0: -; CHECK-AVX512F-NEXT: subq $72, %rsp -; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 80 -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-AVX512F-NEXT: vpextrw $7, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: 
vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $6, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $5, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $4, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $3, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vpextrw $1, %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax -; CHECK-AVX512F-NEXT: movswl %ax, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] -; 
CHECK-AVX512F-NEXT: callq ldexpf@PLT -; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-AVX512F-NEXT: addq $72, %rsp -; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8 +; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3] +; CHECK-AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 +; CHECK-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0 +; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; CHECK-AVX512F-NEXT: vzeroupper ; CHECK-AVX512F-NEXT: retq %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i) ret <8 x half> %r @@ -1141,8 +1089,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: subq $56, %rsp +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] ; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] ; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax @@ -1171,8 +1119,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; ; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: ; CHECK-ONLY-AVX512F: # %bb.0: +; CHECK-ONLY-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] ; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-ONLY-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0] ; CHECK-ONLY-AVX512F-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-ONLY-AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll index c1beb7c..c9c88f7 100644 --- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll +++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll @@ -1031,31 +1031,30 @@ define void @simple_urem_fail_intermediate_inc(i32 %N, i32 %rem_amt) nounwind { ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: je .LBB17_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $1, %r15d +; CHECK-NEXT: movl $1, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB17_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 
1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB17_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB17_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1199,32 +1198,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_ ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB21_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: orl $16, %ebx ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB21_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB21_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB21_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1251,32 +1249,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32 ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB22_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: orl $16, %ebx ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB22_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB22_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB22_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1303,31 +1300,30 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem( ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB23_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB23_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; 
CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB23_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB23_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1404,32 +1400,31 @@ define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i3 ; CHECK-NEXT: cmpl %edx, %edi ; CHECK-NEXT: jbe .LBB25_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r15d -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %edx, %ebx +; CHECK-NEXT: movl %esi, %ebp ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: addl $-2, %r15d +; CHECK-NEXT: addl $-2, %ebx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB25_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebx, %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: divl %ebx +; CHECK-NEXT: divl %ebp ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $-2, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB25_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB25_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll index e223765..46b2571 100644 --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -490,20 +490,21 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind { define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_ashr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sarl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sarl $3, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: sarl $6, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: sarl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax +; X64-NEXT: sarl $3, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: sarl $6, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = ashr exact i32 %a0, 3 %y = freeze i32 %x @@ -604,20 +605,21 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind { define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_lshr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shrl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: shrl $5, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: 
freeze_lshr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: shrl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrl $5, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = lshr exact i32 %a0, 3 %y = freeze i32 %x diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll index feac3dc..638d884 100644 --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -388,7 +388,7 @@ define <16 x i8> @constant_shl_v16i8(<16 x i8> %a) nounwind { ; GFNISSE-NEXT: movdqa %xmm0, %xmm1 ; GFNISSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNISSE-NEXT: psllw $8, %xmm1 -; GFNISSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; GFNISSE-NEXT: por %xmm1, %xmm0 ; GFNISSE-NEXT: retq @@ -397,7 +397,7 @@ define <16 x i8> @constant_shl_v16i8(<16 x i8> %a) nounwind { ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; GFNIAVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; GFNIAVX1-NEXT: retq @@ -1213,21 +1213,20 @@ define <32 x i8> @splatvar_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-LABEL: constant_shl_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNISSE-NEXT: movdqa %xmm0, %xmm3 -; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3 +; GFNISSE-NEXT: pmullw %xmm2, %xmm3 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; GFNISSE-NEXT: pand %xmm4, %xmm3 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm0 ; GFNISSE-NEXT: psllw $8, %xmm0 ; GFNISSE-NEXT: por %xmm3, %xmm0 -; GFNISSE-NEXT: movdqa %xmm1, %xmm3 -; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3 -; GFNISSE-NEXT: pand %xmm4, %xmm3 +; GFNISSE-NEXT: pmullw %xmm1, %xmm2 +; GFNISSE-NEXT: pand %xmm4, %xmm2 ; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm1 ; GFNISSE-NEXT: psllw $8, %xmm1 -; GFNISSE-NEXT: por %xmm3, %xmm1 +; GFNISSE-NEXT: por %xmm2, %xmm1 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: constant_shl_v32i8: @@ -1239,9 +1238,9 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1 ; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2] -; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3 -; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] +; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3 +; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%ymm0, %ymm0 ; GFNIAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1251,14 +1250,14 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNIAVX2-NEXT: vpsllw $8, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512VL-LABEL: constant_shl_v32i8: ; GFNIAVX512VL: # %bb.0: -; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNIAVX512VL-NEXT: vpsllw $8, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst) @@ -1684,15 +1683,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_shl_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -1876,15 +1874,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_lshr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -2232,36 +2230,16 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_ashr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; GFNIAVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift 
@@ -2542,9 +2520,9 @@ define <64 x i8> @splatvar_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-LABEL: constant_shl_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,4,16,64,128,32,8,2] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNISSE-NEXT: movdqa %xmm0, %xmm6 -; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6 +; GFNISSE-NEXT: pmullw %xmm4, %xmm6 ; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; GFNISSE-NEXT: pand %xmm5, %xmm6 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] @@ -2552,23 +2530,22 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-NEXT: psllw $8, %xmm0 ; GFNISSE-NEXT: por %xmm6, %xmm0 ; GFNISSE-NEXT: movdqa %xmm1, %xmm6 -; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6 +; GFNISSE-NEXT: pmullw %xmm4, %xmm6 ; GFNISSE-NEXT: pand %xmm5, %xmm6 ; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm1 ; GFNISSE-NEXT: psllw $8, %xmm1 ; GFNISSE-NEXT: por %xmm6, %xmm1 ; GFNISSE-NEXT: movdqa %xmm2, %xmm6 -; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6 +; GFNISSE-NEXT: pmullw %xmm4, %xmm6 ; GFNISSE-NEXT: pand %xmm5, %xmm6 ; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm2 ; GFNISSE-NEXT: psllw $8, %xmm2 ; GFNISSE-NEXT: por %xmm6, %xmm2 -; GFNISSE-NEXT: movdqa %xmm3, %xmm6 -; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6 -; GFNISSE-NEXT: pand %xmm5, %xmm6 +; GFNISSE-NEXT: pmullw %xmm3, %xmm4 +; GFNISSE-NEXT: pand %xmm5, %xmm4 ; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm3 ; GFNISSE-NEXT: psllw $8, %xmm3 -; GFNISSE-NEXT: por %xmm6, %xmm3 +; GFNISSE-NEXT: por %xmm4, %xmm3 ; GFNISSE-NEXT: retq ; ; GFNIAVX1-LABEL: constant_shl_v64i8: @@ -2580,9 +2557,9 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm4, %xmm5 ; GFNIAVX1-NEXT: vpsllw $8, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,4,16,64,128,32,8,2] -; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] +; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 +; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; GFNIAVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 @@ -2593,8 +2570,8 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm6, %xmm2 ; GFNIAVX1-NEXT: vpsllw $8, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm6, %xmm3 -; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 +; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm3 +; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; GFNIAVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; GFNIAVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 @@ -2602,9 +2579,9 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; ; GFNIAVX2-LABEL: constant_shl_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] -; GFNIAVX2-NEXT: vpmaddubsw %ymm2, 
%ymm0, %ymm3 +; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm3 ; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; GFNIAVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] @@ -2612,7 +2589,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsllw $8, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 -; GFNIAVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2 +; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm2 ; GFNIAVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpsllw $8, %ymm1, %ymm1 @@ -2622,10 +2599,10 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX512VL-LABEL: constant_shl_v64i8: ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNIAVX512VL-NEXT: # ymm2 = mem[0,1,0,1] -; GFNIAVX512VL-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3 -; GFNIAVX512VL-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2 +; GFNIAVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; GFNIAVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm2 ; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNIAVX512VL-NEXT: # ymm3 = mem[0,1,0,1] @@ -2639,7 +2616,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind { ; ; GFNIAVX512BW-LABEL: constant_shl_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; GFNIAVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; GFNIAVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 ; GFNIAVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst) diff --git a/llvm/test/CodeGen/X86/haddsubsat.ll b/llvm/test/CodeGen/X86/haddsubsat.ll new file mode 100644 index 0000000..588f338 --- /dev/null +++ b/llvm/test/CodeGen/X86/haddsubsat.ll @@ -0,0 +1,101 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s -check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=AVX2 + +define <8 x i16> @phaddsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) { +; SSSE3-LABEL: phaddsw_v8i16_intrinsic: +; SSSE3: # %bb.0: +; SSSE3-NEXT: phaddsw %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; AVX2-LABEL: phaddsw_v8i16_intrinsic: +; AVX2: # %bb.0: +; AVX2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a, <8 x 
i16> %b) + ret <8 x i16> %res +} + +define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) { +; SSSE3-LABEL: phaddsw_v8i16_generic: +; SSSE3: # %bb.0: +; SSSE3-NEXT: phaddsw %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; AVX2-LABEL: phaddsw_v8i16_generic: +; AVX2: # %bb.0: +; AVX2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %even = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %odd = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %sum = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %even, <8 x i16> %odd) + ret <8 x i16> %sum +} + +define <16 x i16> @phaddsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) { +; SSSE3-LABEL: phaddsw_v16i16_generic: +; SSSE3: # %bb.0: +; SSSE3-NEXT: phaddsw %xmm1, %xmm0 +; SSSE3-NEXT: phaddsw %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: retq +; +; AVX2-LABEL: phaddsw_v16i16_generic: +; AVX2: # %bb.0: +; AVX2-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq + %even = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %odd = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + %sum = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %even, <16 x i16> %odd) + ret <16 x i16> %sum +} + +define <8 x i16> @phsubsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) { +; SSSE3-LABEL: phsubsw_v8i16_intrinsic: +; SSSE3: # %bb.0: +; SSSE3-NEXT: phsubsw %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; AVX2-LABEL: phsubsw_v8i16_intrinsic: +; AVX2: # %bb.0: +; AVX2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %res +} + +define <8 x i16> @phsubsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) { +; SSSE3-LABEL: phsubsw_v8i16_generic: +; SSSE3: # %bb.0: +; SSSE3-NEXT: phsubsw %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; AVX2-LABEL: phsubsw_v8i16_generic: +; AVX2: # %bb.0: +; AVX2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %even = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %odd = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %diff = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %even, <8 x i16> %odd) + ret <8 x i16> %diff +} + +define <16 x i16> @phsubsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) { +; SSSE3-LABEL: phsubsw_v16i16_generic: +; SSSE3: # %bb.0: +; SSSE3-NEXT: phsubsw %xmm1, %xmm0 +; SSSE3-NEXT: phsubsw %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: retq +; +; AVX2-LABEL: phsubsw_v16i16_generic: +; AVX2: # %bb.0: +; AVX2-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq + %even = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> + %odd = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> + %diff = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> 
%even, <16 x i16> %odd) + ret <16 x i16> %diff +} diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll index 53b70fa..c98889b 100644 --- a/llvm/test/CodeGen/X86/icmp-abs-C.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll @@ -161,22 +161,22 @@ define i16 @ne_and_with_dom_abs(i16 %x) nounwind { ; X86-LABEL: ne_and_with_dom_abs: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movswl %cx, %eax -; X86-NEXT: sarl $15, %eax -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: subl %eax, %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movswl %ax, %ecx +; X86-NEXT: sarl $15, %ecx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: xorl $12312, %eax # imm = 0x3018 ; X86-NEXT: movzwl %ax, %esi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpw $64, %cx -; X86-NEXT: setne %cl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpw $64, %dx +; X86-NEXT: setne %dl ; X86-NEXT: cmpl $2345, %esi # imm = 0x929 ; X86-NEXT: jae .LBB3_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movb %dl, %cl +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB3_2: ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo b/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo deleted file mode 100644 index 935b707..0000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo +++ /dev/null @@ -1,4 +0,0 @@ -caller:0:0 - 2: sum:0 - 3: 0 __prefetch_nta_0:23456 - 3.1: 0 __prefetch_nta_0:8764 __prefetch_nta_1:64
\ No newline at end of file diff --git a/llvm/test/CodeGen/X86/insert-prefetch-inline.ll b/llvm/test/CodeGen/X86/insert-prefetch-inline.ll deleted file mode 100644 index 05f5427..0000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-inline.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-inline.afdo | FileCheck %s -; -; Verify we can insert prefetch instructions in code belonging to inlined -; functions. -; -; ModuleID = 'test.cc' - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: norecurse nounwind readonly uwtable -define dso_local i32 @sum(ptr nocapture readonly %arr, i32 %pos1, i32 %pos2) local_unnamed_addr #0 !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !10 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !10 - %0 = load i32, ptr %arrayidx, align 4, !dbg !10, !tbaa !11 - %idxprom1 = sext i32 %pos2 to i64, !dbg !15 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !15 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !15, !tbaa !11 - %add = add nsw i32 %1, %0, !dbg !16 - ret i32 %add, !dbg !17 -} - -; "caller" inlines "sum". The associated .afdo file references instructions -; in "caller" that came from "sum"'s inlining. -; -; Function Attrs: norecurse nounwind readonly uwtable -define dso_local i32 @caller(ptr nocapture readonly %arr) local_unnamed_addr #0 !dbg !18 { -entry: - %0 = load i32, ptr %arr, align 4, !dbg !19, !tbaa !11 - %arrayidx2.i = getelementptr inbounds i32, ptr %arr, i64 2, !dbg !21 - %1 = load i32, ptr %arrayidx2.i, align 4, !dbg !21, !tbaa !11 - %add.i = add nsw i32 %1, %0, !dbg !22 - ret i32 %add.i, !dbg !23 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 7.0.0 (trunk 324940) (llvm/trunk 324941)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 324940) (llvm/trunk 324941)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !8, file: !8, line: 3, type: !9, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DIFile(filename: "./test.h", directory: "/tmp") -!9 = !DISubroutineType(types: !2) -!10 = !DILocation(line: 6, column: 10, scope: !7) -!11 = !{!12, !12, i64 0} -!12 = !{!"int", !13, i64 0} -!13 = !{!"omnipotent char", !14, i64 0} -!14 = !{!"Simple C++ TBAA"} -!15 = !DILocation(line: 6, column: 22, scope: !7) -!16 = !DILocation(line: 6, column: 20, scope: !7) -!17 = !DILocation(line: 6, column: 3, scope: !7) -!18 = distinct !DISubprogram(name: "caller", linkageName: "caller", scope: !1, file: !1, line: 4, type: !9, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!19 = !DILocation(line: 6, column: 10, scope: !7, inlinedAt: !20) -!20 = distinct !DILocation(line: 6, column: 10, scope: !18) -!21 = !DILocation(line: 6, column: 22, scope: !7, inlinedAt: !20) -!22 = !DILocation(line: 6, column: 20, scope: !7, inlinedAt: !20) -!23 = !DILocation(line: 6, column: 3, scope: !18) - -; 
CHECK-LABEL: caller: -; CHECK-LABEL: # %bb.0: -; CHECK-NEXT: .loc 1 6 22 prologue_end -; CHECK-NEXT: prefetchnta 23464(%rdi) -; CHECK-NEXT: movl 8(%rdi), %eax -; CHECK-NEXT: .loc 1 6 20 is_stmt 0 discriminator 2 -; CHECK-NEXT: prefetchnta 8764(%rdi) -; CHECK-NEXT: prefetchnta 64(%rdi) -; CHECK-NEXT: addl (%rdi), %eax diff --git a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo deleted file mode 100644 index 6385a49..0000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo +++ /dev/null @@ -1,2 +0,0 @@ -main:0:0 - 6: 0 __prefetch_nta_0:42
\ No newline at end of file diff --git a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll deleted file mode 100644 index f8e2502..0000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-invalid-instr.afdo | FileCheck %s -; ModuleID = 'prefetch.cc' -source_filename = "prefetch.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: norecurse nounwind uwtable -define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 { -entry: - tail call void @llvm.prefetch(ptr inttoptr (i64 291 to ptr), i32 0, i32 0, i32 1), !dbg !9 - ret i32 291, !dbg !11 -} - -; Function Attrs: inaccessiblemem_or_argmemonly nounwind -declare void @llvm.prefetch(ptr nocapture readonly, i32, i32, i32) #1 - -attributes #0 = {"target-cpu"="x86-64" "target-features"="+sse4.2,+ssse3"} -attributes #1 = { inaccessiblemem_or_argmemonly nounwind } -attributes #2 = { argmemonly nounwind } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "prefetch.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 327078) (llvm/trunk 327086)"} -!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 12, column: 3, scope: !7) -!10 = !DILocation(line: 14, column: 3, scope: !7) -!11 = !DILocation(line: 15, column: 3, scope: !7) - -;CHECK-LABEL: main: -;CHECK: # %bb.0: -;CHECK: prefetchnta 291 -;CHECK-NOT: prefetchnta 42(%rax,%ymm0) diff --git a/llvm/test/CodeGen/X86/insert-prefetch-other.afdo b/llvm/test/CodeGen/X86/insert-prefetch-other.afdo deleted file mode 100644 index 783da34..0000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-other.afdo +++ /dev/null @@ -1,3 +0,0 @@ -sum:0:0 - 1: 0 __prefetch_t0_1:0 __prefetch_t2_0:42 - 1.1: 0 __prefetch_t1_0:18446744073709551615 diff --git a/llvm/test/CodeGen/X86/insert-prefetch.afdo b/llvm/test/CodeGen/X86/insert-prefetch.afdo deleted file mode 100644 index 96487e85..0000000 --- a/llvm/test/CodeGen/X86/insert-prefetch.afdo +++ /dev/null @@ -1,3 +0,0 @@ -sum:0:0 - 1: 0 __prefetch_nta_1:0 __prefetch_nta_0:42 - 1.1: 0 __prefetch_nta_0:18446744073709551615 diff --git a/llvm/test/CodeGen/X86/insert-prefetch.ll b/llvm/test/CodeGen/X86/insert-prefetch.ll deleted file mode 100644 index 971a619..0000000 --- a/llvm/test/CodeGen/X86/insert-prefetch.ll +++ /dev/null @@ -1,101 +0,0 @@ -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch.afdo | FileCheck %s -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-other.afdo | FileCheck %s -check-prefix=OTHERS -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; NOTE: debug line numbers were adjusted such that the function would start -; at line 15 (an arbitrary number). 
The sample profile file format uses -; offsets from the start of the symbol instead of file-relative line numbers. -; The .afdo file reflects that - the instructions are offset '1'. -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !35 !prof !37 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !38 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !38 - %0 = load i32, ptr %arrayidx, align 4, !dbg !38, !tbaa !39 - %idxprom1 = sext i32 %pos2 to i64, !dbg !43 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !43 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !43, !tbaa !39 - %add = add nsw i32 %1, %0, !dbg !44 - ret i32 %add, !dbg !45 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5, !6} -!llvm.ident = !{!33} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{i32 1, !"ProfileSummary", !7} -!7 = !{!8, !9, !10, !11, !12, !13, !14, !15} -!8 = !{!"ProfileFormat", !"SampleProfile"} -!9 = !{!"TotalCount", i64 0} -!10 = !{!"MaxCount", i64 0} -!11 = !{!"MaxInternalCount", i64 0} -!12 = !{!"MaxFunctionCount", i64 0} -!13 = !{!"NumCounts", i64 2} -!14 = !{!"NumFunctions", i64 1} -!15 = !{!"DetailedSummary", !16} -!16 = !{!17, !18, !19, !20, !21, !22, !22, !23, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32} -!17 = !{i32 10000, i64 0, i32 0} -!18 = !{i32 100000, i64 0, i32 0} -!19 = !{i32 200000, i64 0, i32 0} -!20 = !{i32 300000, i64 0, i32 0} -!21 = !{i32 400000, i64 0, i32 0} -!22 = !{i32 500000, i64 0, i32 0} -!23 = !{i32 600000, i64 0, i32 0} -!24 = !{i32 700000, i64 0, i32 0} -!25 = !{i32 800000, i64 0, i32 0} -!26 = !{i32 900000, i64 0, i32 0} -!27 = !{i32 950000, i64 0, i32 0} -!28 = !{i32 990000, i64 0, i32 0} -!29 = !{i32 999000, i64 0, i32 0} -!30 = !{i32 999900, i64 0, i32 0} -!31 = !{i32 999990, i64 0, i32 0} -!32 = !{i32 999999, i64 0, i32 0} -!33 = !{!"clang version 7.0.0 (trunk 322593) (llvm/trunk 322526)"} -!35 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 15, type: !36, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!36 = !DISubroutineType(types: !2) -!37 = !{!"function_entry_count", i64 -1} -!38 = !DILocation(line: 16, column: 10, scope: !35) -!39 = !{!40, !40, i64 0} -!40 = !{!"int", !41, i64 0} -!41 = !{!"omnipotent char", !42, i64 0} -!42 = !{!"Simple C++ TBAA"} -!43 = !DILocation(line: 16, column: 22, scope: !35) -!44 = !DILocation(line: 16, column: 20, scope: !35) -!45 = !DILocation(line: 16, column: 3, scope: !35) - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: prefetchnta 42(%rdi,%rax,4) -;CHECK-NEXT: prefetchnta (%rdi,%rax,4) -;CHECK-NEXT: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 16 20 discriminator 2 # test.cc:16:20 -;CHECK-NEXT: prefetchnta -1(%rdi,%rcx,4) -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax -;CHECK-NEXT: .loc 1 16 3 # test.cc:16:3 - -;OTHERS-LABEL: sum: -;OTHERS: # %bb.0: -;OTHERS: prefetcht2 42(%rdi,%rax,4) -;OTHERS-NEXT: prefetcht0 (%rdi,%rax,4) -;OTHERS-NEXT: movl (%rdi,%rax,4), 
%eax -;OTHERS-NEXT: .loc 1 16 20 discriminator 2 # test.cc:16:20 -;OTHERS-NEXT: prefetcht1 -1(%rdi,%rcx,4) -;OTHERS-NEXT: addl (%rdi,%rcx,4), %eax -;OTHERS-NEXT: .loc 1 16 3 # test.cc:16:3 diff --git a/llvm/test/CodeGen/X86/ipra-reg-usage.ll b/llvm/test/CodeGen/X86/ipra-reg-usage.ll index e73ff79..f270f8f 100644 --- a/llvm/test/CodeGen/X86/ipra-reg-usage.ll +++ b/llvm/test/CodeGen/X86/ipra-reg-usage.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-unknown" declare void @bar1() define preserve_allcc void @foo()#0 { -; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh +; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 
$r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh call void @bar1() call void @bar2() ret void @@ -15,7 +15,7 @@ define preserve_allcc void @foo()#0 { declare void @bar2() define preserve_nonecc void @foo2()#0 { -; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh +; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 
$xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh call void @bar1() call void @bar2() ret void diff --git a/llvm/test/CodeGen/X86/isel-arg-attrs.ll b/llvm/test/CodeGen/X86/isel-arg-attrs.ll new file mode 100644 index 0000000..3afee76 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-arg-attrs.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=X64 + +; The src array should be in R10 or ECX register due to nest attribute +define i32 @nest_arg(ptr nest %src) { +; X86-LABEL: nest_arg: +; X86: # %bb.0: +; X86-NEXT: movl 8(%ecx), %eax +; X86-NEXT: retl +; +; X64-LABEL: nest_arg: +; X64: # %bb.0: +; X64-NEXT: movl 8(%r10), %eax +; X64-NEXT: retq + %off = getelementptr [3 x i32], ptr %src, i32 0, i32 2 + %ret = load i32, ptr %off + ret i32 %ret +} diff --git a/llvm/test/CodeGen/X86/isel-icmp.ll b/llvm/test/CodeGen/X86/isel-icmp.ll index 8a4d035..065d701 100644 --- a/llvm/test/CodeGen/X86/isel-icmp.ll +++ b/llvm/test/CodeGen/X86/isel-icmp.ll @@ -1,11 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefix=SDAG-X64 -; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefix=FAST-X64 -; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=GISEL-X64 -; RUN: llc < %s -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s 
--check-prefixes=SDAG-X86 +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=SDAG-X64 +; Allow fast-isel to fallback to selection dag on x86 for i96 type. +; RUN: llc < %s -fast-isel -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=FAST-X64 +; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefixes=GISEL-X64 +; RUN: llc < %s -mtriple=i686-apple-darwin10 | FileCheck %s --check-prefixes=SDAG-X86 ; Allow fast-isel to fallback to selection dag on x86 -; RUN: llc < %s -fast-isel -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=FAST-X86 -; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=GISEL-X86 +; RUN: llc < %s -fast-isel -mtriple=i686-apple-darwin10 | FileCheck %s --check-prefixes=FAST-X86 +; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=i686-apple-darwin10 | FileCheck %s --check-prefixes=GISEL-X86 define i32 @test_icmp_eq_i8(i8 %a, i8 %b) { ; SDAG-X64-LABEL: test_icmp_eq_i8: @@ -720,3 +721,168 @@ define i32 @test_icmp_sle_i32(i32 %a, i32 %b) { %res = zext i1 %r to i32 ret i32 %res } + +; PR167326 +define i32 @test_icmp_sge_i96(i96 %a, i96 %b) nounwind { +; SDAG-X64-LABEL: test_icmp_sge_i96: +; SDAG-X64: ## %bb.0: +; SDAG-X64-NEXT: movslq %ecx, %rax +; SDAG-X64-NEXT: movslq %esi, %rcx +; SDAG-X64-NEXT: cmpq %rdx, %rdi +; SDAG-X64-NEXT: sbbq %rax, %rcx +; SDAG-X64-NEXT: setge %al +; SDAG-X64-NEXT: movzbl %al, %eax +; SDAG-X64-NEXT: retq +; +; FAST-X64-LABEL: test_icmp_sge_i96: +; FAST-X64: ## %bb.0: +; FAST-X64-NEXT: movslq %ecx, %rax +; FAST-X64-NEXT: movslq %esi, %rcx +; FAST-X64-NEXT: cmpq %rdx, %rdi +; FAST-X64-NEXT: sbbq %rax, %rcx +; FAST-X64-NEXT: setge %al +; FAST-X64-NEXT: andb $1, %al +; FAST-X64-NEXT: movzbl %al, %eax +; FAST-X64-NEXT: retq +; +; GISEL-X64-LABEL: test_icmp_sge_i96: +; GISEL-X64: ## %bb.0: +; GISEL-X64-NEXT: movq %rcx, %rax +; GISEL-X64-NEXT: movq %rdi, %r8 +; GISEL-X64-NEXT: movb $32, %cl +; GISEL-X64-NEXT: shlq %cl, %r8 +; GISEL-X64-NEXT: shlq %cl, %rsi +; GISEL-X64-NEXT: shrq %cl, %rdi +; GISEL-X64-NEXT: orq %rsi, %rdi +; GISEL-X64-NEXT: shrq %cl, %r8 +; GISEL-X64-NEXT: movq %rdi, %rsi +; GISEL-X64-NEXT: shlq %cl, %rsi +; GISEL-X64-NEXT: orq %r8, %rsi +; GISEL-X64-NEXT: sarq %cl, %rdi +; GISEL-X64-NEXT: movq %rdx, %rcx +; GISEL-X64-NEXT: shlq $32, %rcx +; GISEL-X64-NEXT: shlq $32, %rax +; GISEL-X64-NEXT: shrq $32, %rdx +; GISEL-X64-NEXT: orq %rax, %rdx +; GISEL-X64-NEXT: shrq $32, %rcx +; GISEL-X64-NEXT: movq %rdx, %rax +; GISEL-X64-NEXT: shlq $32, %rax +; GISEL-X64-NEXT: orq %rcx, %rax +; GISEL-X64-NEXT: sarq $32, %rdx +; GISEL-X64-NEXT: xorl %ecx, %ecx +; GISEL-X64-NEXT: cmpq %rax, %rsi +; GISEL-X64-NEXT: setae %cl +; GISEL-X64-NEXT: xorl %eax, %eax +; GISEL-X64-NEXT: xorl %esi, %esi +; GISEL-X64-NEXT: cmpq %rdx, %rdi +; GISEL-X64-NEXT: setge %al +; GISEL-X64-NEXT: sete %sil +; GISEL-X64-NEXT: testl %esi, %esi +; GISEL-X64-NEXT: cmovnew %cx, %ax +; GISEL-X64-NEXT: andl $1, %eax +; GISEL-X64-NEXT: retq +; +; SDAG-X86-LABEL: test_icmp_sge_i96: +; SDAG-X86: ## %bb.0: +; SDAG-X86-NEXT: pushl %ebx +; SDAG-X86-NEXT: pushl %edi +; SDAG-X86-NEXT: pushl %esi +; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; SDAG-X86-NEXT: movl %eax, %ecx +; SDAG-X86-NEXT: sarl $31, %ecx +; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; SDAG-X86-NEXT: movl %edx, %esi +; SDAG-X86-NEXT: sarl $31, %esi +; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; SDAG-X86-NEXT: movl 
{{[0-9]+}}(%esp), %ebx +; SDAG-X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi +; SDAG-X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx +; SDAG-X86-NEXT: sbbl %eax, %edx +; SDAG-X86-NEXT: sbbl %ecx, %esi +; SDAG-X86-NEXT: setge %al +; SDAG-X86-NEXT: movzbl %al, %eax +; SDAG-X86-NEXT: popl %esi +; SDAG-X86-NEXT: popl %edi +; SDAG-X86-NEXT: popl %ebx +; SDAG-X86-NEXT: retl +; +; FAST-X86-LABEL: test_icmp_sge_i96: +; FAST-X86: ## %bb.0: +; FAST-X86-NEXT: pushl %ebx +; FAST-X86-NEXT: pushl %edi +; FAST-X86-NEXT: pushl %esi +; FAST-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FAST-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FAST-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; FAST-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; FAST-X86-NEXT: movl %eax, %edi +; FAST-X86-NEXT: sarl $31, %edi +; FAST-X86-NEXT: movl %edx, %ebx +; FAST-X86-NEXT: sarl $31, %ebx +; FAST-X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp) +; FAST-X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; FAST-X86-NEXT: sbbl %eax, %edx +; FAST-X86-NEXT: sbbl %edi, %ebx +; FAST-X86-NEXT: setge %al +; FAST-X86-NEXT: andb $1, %al +; FAST-X86-NEXT: movzbl %al, %eax +; FAST-X86-NEXT: popl %esi +; FAST-X86-NEXT: popl %edi +; FAST-X86-NEXT: popl %ebx +; FAST-X86-NEXT: retl +; +; GISEL-X86-LABEL: test_icmp_sge_i96: +; GISEL-X86: ## %bb.0: +; GISEL-X86-NEXT: pushl %ebp +; GISEL-X86-NEXT: pushl %ebx +; GISEL-X86-NEXT: pushl %edi +; GISEL-X86-NEXT: pushl %esi +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; GISEL-X86-NEXT: movl %edx, %eax +; GISEL-X86-NEXT: movb $31, %cl +; GISEL-X86-NEXT: sarl %cl, %eax +; GISEL-X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setae %ch +; GISEL-X86-NEXT: xorl %ebx, %ebx +; GISEL-X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: setae %cl +; GISEL-X86-NEXT: sete %bl +; GISEL-X86-NEXT: testl %ebx, %ebx +; GISEL-X86-NEXT: je LBB13_2 +; GISEL-X86-NEXT: ## %bb.1: +; GISEL-X86-NEXT: movb %ch, %cl +; GISEL-X86-NEXT: LBB13_2: +; GISEL-X86-NEXT: movl %esi, %edi +; GISEL-X86-NEXT: sarl $31, %edi +; GISEL-X86-NEXT: xorl %ebx, %ebx +; GISEL-X86-NEXT: cmpl %esi, %edx +; GISEL-X86-NEXT: setae %dl +; GISEL-X86-NEXT: sete %bl +; GISEL-X86-NEXT: testl %ebx, %ebx +; GISEL-X86-NEXT: je LBB13_4 +; GISEL-X86-NEXT: ## %bb.3: +; GISEL-X86-NEXT: movl %ecx, %edx +; GISEL-X86-NEXT: LBB13_4: +; GISEL-X86-NEXT: xorl %ecx, %ecx +; GISEL-X86-NEXT: cmpl %edi, %eax +; GISEL-X86-NEXT: setge %al +; GISEL-X86-NEXT: sete %cl +; GISEL-X86-NEXT: testl %ecx, %ecx +; GISEL-X86-NEXT: je LBB13_6 +; GISEL-X86-NEXT: ## %bb.5: +; GISEL-X86-NEXT: movl %edx, %eax +; GISEL-X86-NEXT: LBB13_6: +; GISEL-X86-NEXT: movzbl %al, %eax +; GISEL-X86-NEXT: andl $1, %eax +; GISEL-X86-NEXT: popl %esi +; GISEL-X86-NEXT: popl %edi +; GISEL-X86-NEXT: popl %ebx +; GISEL-X86-NEXT: popl %ebp +; GISEL-X86-NEXT: retl + %r = icmp sge i96 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} diff --git a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll index 065710f..8576f8f 100644 --- a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll +++ b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll @@ -3,6 +3,9 @@ ; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64 ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64 +; RUN: llc < %s 
-mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-SINCOS-STRET +; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-NOSINCOS-STRET + ; TODO: The below RUN line will fails GISEL selection and will fallback to DAG selection due to lack of support for loads/stores in i686 mode, support is expected soon enough, for this reason the llvm/test/CodeGen/X86/GlobalISel/llvm.sincos.mir test is added for now because of the lack of support for i686 in GlobalISel. ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X86 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 @@ -34,6 +37,29 @@ define { float, float } @test_sincos_f32(float %Val) nounwind { ; X64-NEXT: popq %rax ; X64-NEXT: retq ; +; MACOS-SINCOS-STRET-LABEL: test_sincos_f32: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: pushq %rax +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; MACOS-SINCOS-STRET-NEXT: popq %rax +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f32: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: pushq %rax +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, %xmm1 +; MACOS-NOSINCOS-STRET-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: popq %rax +; MACOS-NOSINCOS-STRET-NEXT: retq +; ; GISEL-X86-LABEL: test_sincos_f32: ; GISEL-X86: # %bb.0: ; GISEL-X86-NEXT: subl $28, %esp @@ -93,6 +119,28 @@ define { double, double } @test_sincos_f64(double %Val) nounwind { ; X64-NEXT: addq $24, %rsp ; X64-NEXT: retq ; +; MACOS-SINCOS-STRET-LABEL: test_sincos_f64: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: pushq %rax +; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret +; MACOS-SINCOS-STRET-NEXT: popq %rax +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f64: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: subq $24, %rsp +; MACOS-NOSINCOS-STRET-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _sin +; MACOS-NOSINCOS-STRET-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero +; MACOS-NOSINCOS-STRET-NEXT: callq _cos +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, %xmm1 +; MACOS-NOSINCOS-STRET-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero +; MACOS-NOSINCOS-STRET-NEXT: addq $24, %rsp +; MACOS-NOSINCOS-STRET-NEXT: retq +; ; GISEL-X86-LABEL: test_sincos_f64: ; GISEL-X86: # %bb.0: ; GISEL-X86-NEXT: subl $44, %esp @@ -153,6 +201,40 @@ define { x86_fp80, x86_fp80 } @test_sincos_f80(x86_fp80 %Val) nounwind { ; X64-NEXT: addq $56, %rsp ; X64-NEXT: retq ; +; MACOS-SINCOS-STRET-LABEL: test_sincos_f80: +; MACOS-SINCOS-STRET: ## %bb.0: +; 
MACOS-SINCOS-STRET-NEXT: subq $40, %rsp +; MACOS-SINCOS-STRET-NEXT: fldt {{[0-9]+}}(%rsp) +; MACOS-SINCOS-STRET-NEXT: fld %st(0) +; MACOS-SINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill +; MACOS-SINCOS-STRET-NEXT: fstpt (%rsp) +; MACOS-SINCOS-STRET-NEXT: callq _cosl +; MACOS-SINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill +; MACOS-SINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: fstpt (%rsp) +; MACOS-SINCOS-STRET-NEXT: callq _sinl +; MACOS-SINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: fxch %st(1) +; MACOS-SINCOS-STRET-NEXT: addq $40, %rsp +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f80: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: subq $40, %rsp +; MACOS-NOSINCOS-STRET-NEXT: fldt {{[0-9]+}}(%rsp) +; MACOS-NOSINCOS-STRET-NEXT: fld %st(0) +; MACOS-NOSINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill +; MACOS-NOSINCOS-STRET-NEXT: fstpt (%rsp) +; MACOS-NOSINCOS-STRET-NEXT: callq _cosl +; MACOS-NOSINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill +; MACOS-NOSINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: fstpt (%rsp) +; MACOS-NOSINCOS-STRET-NEXT: callq _sinl +; MACOS-NOSINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: fxch %st(1) +; MACOS-NOSINCOS-STRET-NEXT: addq $40, %rsp +; MACOS-NOSINCOS-STRET-NEXT: retq +; ; GISEL-X86-LABEL: test_sincos_f80: ; GISEL-X86: # %bb.0: ; GISEL-X86-NEXT: subl $60, %esp @@ -288,6 +370,57 @@ define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias % ; SDAG-X64-NEXT: popq %r14 ; SDAG-X64-NEXT: retq ; +; MACOS-SINCOS-STRET-LABEL: can_fold_with_call_in_chain: +; MACOS-SINCOS-STRET: ## %bb.0: ## %entry +; MACOS-SINCOS-STRET-NEXT: pushq %r14 +; MACOS-SINCOS-STRET-NEXT: pushq %rbx +; MACOS-SINCOS-STRET-NEXT: subq $40, %rsp +; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movq %r14, %rdi +; MACOS-SINCOS-STRET-NEXT: movq %rbx, %rsi +; MACOS-SINCOS-STRET-NEXT: callq _foo +; MACOS-SINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movss %xmm0, (%r14) +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movss %xmm0, (%rbx) +; MACOS-SINCOS-STRET-NEXT: addq $40, %rsp +; MACOS-SINCOS-STRET-NEXT: popq %rbx +; MACOS-SINCOS-STRET-NEXT: popq %r14 +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: can_fold_with_call_in_chain: +; MACOS-NOSINCOS-STRET: ## %bb.0: ## %entry +; MACOS-NOSINCOS-STRET-NEXT: pushq %r14 +; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx +; MACOS-NOSINCOS-STRET-NEXT: pushq %rax +; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload +; 
MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movq %r14, %rdi +; MACOS-NOSINCOS-STRET-NEXT: movq %rbx, %rsi +; MACOS-NOSINCOS-STRET-NEXT: callq _foo +; MACOS-NOSINCOS-STRET-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%r14) +; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero +; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rbx) +; MACOS-NOSINCOS-STRET-NEXT: addq $8, %rsp +; MACOS-NOSINCOS-STRET-NEXT: popq %rbx +; MACOS-NOSINCOS-STRET-NEXT: popq %r14 +; MACOS-NOSINCOS-STRET-NEXT: retq +; ; GISEL-X86-LABEL: can_fold_with_call_in_chain: ; GISEL-X86: # %bb.0: # %entry ; GISEL-X86-NEXT: pushl %ebx diff --git a/llvm/test/CodeGen/X86/kmov.ll b/llvm/test/CodeGen/X86/kmov.ll index 8b1e69a..5d216a2 100644 --- a/llvm/test/CodeGen/X86/kmov.ll +++ b/llvm/test/CodeGen/X86/kmov.ll @@ -477,16 +477,13 @@ define <32 x i1> @invert_i64_mask_extract_32(i64 %mask) { ; X64-AVX512-LABEL: invert_i64_mask_extract_32: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: kmovq %rdi, %k0 -; X64-AVX512-NEXT: knotb %k0, %k1 -; X64-AVX512-NEXT: kshiftrd $8, %k0, %k2 -; X64-AVX512-NEXT: knotb %k2, %k2 -; X64-AVX512-NEXT: kunpckbw %k1, %k2, %k1 +; X64-AVX512-NEXT: kshiftrd $8, %k0, %k1 +; X64-AVX512-NEXT: kunpckbw %k0, %k1, %k1 ; X64-AVX512-NEXT: kshiftrd $16, %k0, %k2 -; X64-AVX512-NEXT: knotb %k2, %k2 ; X64-AVX512-NEXT: kshiftrd $24, %k0, %k0 -; X64-AVX512-NEXT: knotb %k0, %k0 ; X64-AVX512-NEXT: kunpckbw %k2, %k0, %k0 ; X64-AVX512-NEXT: kunpckwd %k1, %k0, %k0 +; X64-AVX512-NEXT: knotd %k0, %k0 ; X64-AVX512-NEXT: vpmovm2b %k0, %ymm0 ; X64-AVX512-NEXT: retq ; @@ -495,18 +492,16 @@ define <32 x i1> @invert_i64_mask_extract_32(i64 %mask) { ; X64-KNL-NEXT: movl %edi, %eax ; X64-KNL-NEXT: shrl $16, %eax ; X64-KNL-NEXT: kmovw %eax, %k0 -; X64-KNL-NEXT: knotw %k0, %k0 ; X64-KNL-NEXT: movl %edi, %eax ; X64-KNL-NEXT: shrl $24, %eax ; X64-KNL-NEXT: kmovw %eax, %k1 -; X64-KNL-NEXT: knotw %k1, %k1 -; X64-KNL-NEXT: kunpckbw %k0, %k1, %k1 +; X64-KNL-NEXT: kunpckbw %k0, %k1, %k0 +; X64-KNL-NEXT: knotw %k0, %k1 ; X64-KNL-NEXT: kmovw %edi, %k0 -; X64-KNL-NEXT: knotw %k0, %k0 ; X64-KNL-NEXT: shrl $8, %edi ; X64-KNL-NEXT: kmovw %edi, %k2 -; X64-KNL-NEXT: knotw %k2, %k2 -; X64-KNL-NEXT: kunpckbw %k0, %k2, %k2 +; X64-KNL-NEXT: kunpckbw %k0, %k2, %k0 +; X64-KNL-NEXT: knotw %k0, %k2 ; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; X64-KNL-NEXT: vpmovdb %zmm0, %xmm0 ; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 @@ -586,27 +581,20 @@ define <64 x i1> @invert_i64_mask_extract_64(i64 %mask) { ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: kmovq %rdi, %k0 ; X64-AVX512-NEXT: kshiftrq $32, %k0, %k1 -; X64-AVX512-NEXT: knotb %k1, %k1 ; X64-AVX512-NEXT: kshiftrq $40, %k0, %k2 -; X64-AVX512-NEXT: knotb %k2, %k2 ; X64-AVX512-NEXT: kunpckbw %k1, %k2, %k1 ; X64-AVX512-NEXT: kshiftrq $48, %k0, %k2 -; X64-AVX512-NEXT: knotb %k2, %k2 ; X64-AVX512-NEXT: kshiftrq $56, %k0, %k3 -; X64-AVX512-NEXT: knotb %k3, %k3 ; X64-AVX512-NEXT: kunpckbw %k2, %k3, %k2 ; X64-AVX512-NEXT: kunpckwd %k1, %k2, %k1 -; X64-AVX512-NEXT: knotb %k0, %k2 -; X64-AVX512-NEXT: kshiftrd $8, %k0, %k3 -; X64-AVX512-NEXT: knotb %k3, %k3 -; X64-AVX512-NEXT: kunpckbw %k2, %k3, %k2 +; X64-AVX512-NEXT: kshiftrd $8, %k0, %k2 +; X64-AVX512-NEXT: kunpckbw %k0, 
%k2, %k2 ; X64-AVX512-NEXT: kshiftrd $16, %k0, %k3 -; X64-AVX512-NEXT: knotb %k3, %k3 ; X64-AVX512-NEXT: kshiftrd $24, %k0, %k0 -; X64-AVX512-NEXT: knotb %k0, %k0 ; X64-AVX512-NEXT: kunpckbw %k3, %k0, %k0 ; X64-AVX512-NEXT: kunpckwd %k2, %k0, %k0 ; X64-AVX512-NEXT: kunpckdq %k0, %k1, %k0 +; X64-AVX512-NEXT: knotq %k0, %k0 ; X64-AVX512-NEXT: vpmovm2b %k0, %zmm0 ; X64-AVX512-NEXT: retq ; @@ -614,38 +602,34 @@ define <64 x i1> @invert_i64_mask_extract_64(i64 %mask) { ; X64-KNL: # %bb.0: ; X64-KNL-NEXT: movq %rdi, %rax ; X64-KNL-NEXT: kmovw %esi, %k0 -; X64-KNL-NEXT: knotw %k0, %k0 ; X64-KNL-NEXT: movl %esi, %ecx ; X64-KNL-NEXT: shrl $8, %ecx ; X64-KNL-NEXT: kmovw %ecx, %k1 -; X64-KNL-NEXT: knotw %k1, %k1 ; X64-KNL-NEXT: kunpckbw %k0, %k1, %k0 +; X64-KNL-NEXT: knotw %k0, %k0 ; X64-KNL-NEXT: movl %esi, %ecx ; X64-KNL-NEXT: shrl $16, %ecx ; X64-KNL-NEXT: kmovw %ecx, %k1 -; X64-KNL-NEXT: knotw %k1, %k1 ; X64-KNL-NEXT: movl %esi, %ecx ; X64-KNL-NEXT: shrl $24, %ecx ; X64-KNL-NEXT: kmovw %ecx, %k2 -; X64-KNL-NEXT: knotw %k2, %k2 ; X64-KNL-NEXT: kunpckbw %k1, %k2, %k1 +; X64-KNL-NEXT: knotw %k1, %k1 ; X64-KNL-NEXT: movq %rsi, %rcx ; X64-KNL-NEXT: shrq $32, %rcx ; X64-KNL-NEXT: kmovw %ecx, %k2 -; X64-KNL-NEXT: knotw %k2, %k2 ; X64-KNL-NEXT: movq %rsi, %rcx ; X64-KNL-NEXT: shrq $40, %rcx ; X64-KNL-NEXT: kmovw %ecx, %k3 -; X64-KNL-NEXT: knotw %k3, %k3 ; X64-KNL-NEXT: kunpckbw %k2, %k3, %k2 +; X64-KNL-NEXT: knotw %k2, %k2 ; X64-KNL-NEXT: movq %rsi, %rcx ; X64-KNL-NEXT: shrq $48, %rcx ; X64-KNL-NEXT: kmovw %ecx, %k3 -; X64-KNL-NEXT: knotw %k3, %k3 ; X64-KNL-NEXT: shrq $56, %rsi ; X64-KNL-NEXT: kmovw %esi, %k4 -; X64-KNL-NEXT: knotw %k4, %k4 ; X64-KNL-NEXT: kunpckbw %k3, %k4, %k3 +; X64-KNL-NEXT: knotw %k3, %k3 ; X64-KNL-NEXT: kmovw %k3, 6(%rdi) ; X64-KNL-NEXT: kmovw %k2, 4(%rdi) ; X64-KNL-NEXT: kmovw %k1, 2(%rdi) diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll new file mode 100644 index 0000000..bb6dc31 --- /dev/null +++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll @@ -0,0 +1,312 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512fp16 | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512FP16 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VLF +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512fp16 | FileCheck %s --check-prefixes=CHECK,AVX512VLFP16 + +define half @test_half(half %x, i32 %exp) nounwind { +; AVX512F-LABEL: test_half: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512FP16-LABEL: test_half: +; AVX512FP16: # %bb.0: # %entry +; AVX512FP16-NEXT: vcvtsi2sh %edi, %xmm31, %xmm1 +; AVX512FP16-NEXT: vscalefsh %xmm1, %xmm0, %xmm0 +; AVX512FP16-NEXT: retq +; +; AVX512VL-LABEL: test_half: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1 +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_half: +; AVX512VLFP16: # %bb.0: # %entry +; AVX512VLFP16-NEXT: vcvtsi2sh %edi, %xmm31, %xmm1 +; AVX512VLFP16-NEXT: vscalefsh %xmm1, %xmm0, %xmm0 +; 
AVX512VLFP16-NEXT: retq +entry: + %r = tail call fast half @llvm.ldexp.f16.i32(half %x, i32 %exp) + ret half %r +} +declare half @llvm.ldexp.f16.i32(half, i32) memory(none) + +define float @test_float(float %x, i32 %exp) nounwind { +; CHECK-LABEL: test_float: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1 +; CHECK-NEXT: vscalefss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq +entry: + %r = tail call fast float @ldexpf(float %x, i32 %exp) + ret float %r +} +declare float @ldexpf(float, i32) memory(none) + +define double @test_double(double %x, i32 %exp) nounwind { +; CHECK-LABEL: test_double: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtsi2sd %edi, %xmm15, %xmm1 +; CHECK-NEXT: vscalefsd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq +entry: + %r = tail call fast double @ldexp(double %x, i32 %exp) + ret double %r +} +declare double @ldexp(double, i32) memory(none) + +define fp128 @testExpl(fp128 %x, i32 %exp) nounwind { +; CHECK-LABEL: testExpl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: jmp ldexpl@PLT # TAILCALL +entry: + %r = tail call fast fp128 @ldexpl(fp128 %x, i32 %exp) + ret fp128 %r +} +declare fp128 @ldexpl(fp128, i32) memory(none) + +define <8 x half> @test_ldexp_8xhalf(<8 x half> %x, <8 x i16> %exp) nounwind { +; AVX512F-LABEL: test_ldexp_8xhalf: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512FP16-LABEL: test_ldexp_8xhalf: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512FP16-NEXT: vinsertf32x4 $0, %xmm0, %zmm2, %zmm0 +; AVX512FP16-NEXT: vmovaps %xmm1, %xmm1 +; AVX512FP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0 +; AVX512FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512FP16-NEXT: vzeroupper +; AVX512FP16-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_8xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_ldexp_8xhalf: +; AVX512VLFP16: # %bb.0: +; AVX512VLFP16-NEXT: vcvtw2ph %xmm1, %xmm1 +; AVX512VLFP16-NEXT: vscalefph %xmm1, %xmm0, %xmm0 +; AVX512VLFP16-NEXT: retq + %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> %x, <8 x i16> %exp) + ret <8 x half> %r +} +declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) + +define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind { +; AVX512-LABEL: test_ldexp_4xfloat: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovaps %xmm1, %xmm1 +; AVX512-NEXT: vmovaps %xmm0, %xmm0 +; AVX512-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_4xfloat: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX512VL-NEXT: vscalefps %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_ldexp_4xfloat: +; AVX512VLFP16: # %bb.0: +; AVX512VLFP16-NEXT: vcvtdq2ps %xmm1, %xmm1 +; AVX512VLFP16-NEXT: vscalefps %xmm1, %xmm0, %xmm0 +; AVX512VLFP16-NEXT: retq + %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %x, <4 x i32> %exp) + ret <4 x float> %r +} +declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>) + +define <2 x double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> 
%exp) nounwind { +; CHECK-LABEL: test_ldexp_2xdouble: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2pd %xmm1, %xmm2 +; CHECK-NEXT: vscalefsd %xmm2, %xmm0, %xmm2 +; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; CHECK-NEXT: vcvtdq2pd %xmm1, %xmm1 +; CHECK-NEXT: vscalefsd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; CHECK-NEXT: retq + %r = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %x, <2 x i32> %exp) + ret <2 x double> %r +} +declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>) + +define <16 x half> @test_ldexp_16xhalf(<16 x half> %x, <16 x i16> %exp) nounwind { +; AVX512F-LABEL: test_ldexp_16xhalf: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512FP16-LABEL: test_ldexp_16xhalf: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512FP16-NEXT: vinsertf64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512FP16-NEXT: vmovaps %ymm1, %ymm1 +; AVX512FP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0 +; AVX512FP16-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512FP16-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_16xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_ldexp_16xhalf: +; AVX512VLFP16: # %bb.0: +; AVX512VLFP16-NEXT: vcvtw2ph %ymm1, %ymm1 +; AVX512VLFP16-NEXT: vscalefph %ymm1, %ymm0, %ymm0 +; AVX512VLFP16-NEXT: retq + %r = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> %x, <16 x i16> %exp) + ret <16 x half> %r +} +declare <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half>, <16 x i16>) + +define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind { +; AVX512-LABEL: test_ldexp_8xfloat: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovaps %ymm1, %ymm1 +; AVX512-NEXT: vmovaps %ymm0, %ymm0 +; AVX512-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_8xfloat: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX512VL-NEXT: vscalefps %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_ldexp_8xfloat: +; AVX512VLFP16: # %bb.0: +; AVX512VLFP16-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX512VLFP16-NEXT: vscalefps %ymm1, %ymm0, %ymm0 +; AVX512VLFP16-NEXT: retq + %r = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> %x, <8 x i32> %exp) + ret <8 x float> %r +} +declare <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>) + +define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwind { +; AVX512-LABEL: test_ldexp_4xdouble: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovapd %xmm1, %xmm1 +; AVX512-NEXT: vmovapd %ymm0, %ymm0 +; AVX512-NEXT: vscalefpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_4xdouble: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX512VL-NEXT: vscalefpd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_ldexp_4xdouble: +; AVX512VLFP16: # %bb.0: +; AVX512VLFP16-NEXT: vcvtdq2pd %xmm1, %ymm1 +; AVX512VLFP16-NEXT: vscalefpd %ymm1, %ymm0, %ymm0 +; AVX512VLFP16-NEXT: retq + %r = call <4 x 
double> @llvm.ldexp.v4f64.v4i32(<4 x double> %x, <4 x i32> %exp) + ret <4 x double> %r +} +declare <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>) + +define <32 x half> @test_ldexp_32xhalf(<32 x half> %x, <32 x i16> %exp) nounwind { +; AVX512F-LABEL: test_ldexp_32xhalf: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm2 +; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm3 +; AVX512F-NEXT: vscalefps %zmm2, %zmm3, %zmm2 +; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512FP16-LABEL: test_ldexp_32xhalf: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vcvtw2ph %zmm1, %zmm1 +; AVX512FP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0 +; AVX512FP16-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_32xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm2 +; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm3 +; AVX512VL-NEXT: vscalefps %zmm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vcvtps2ph $4, %zmm2, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512VL-NEXT: retq +; +; AVX512VLFP16-LABEL: test_ldexp_32xhalf: +; AVX512VLFP16: # %bb.0: +; AVX512VLFP16-NEXT: vcvtw2ph %zmm1, %zmm1 +; AVX512VLFP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0 +; AVX512VLFP16-NEXT: retq + %r = call <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half> %x, <32 x i16> %exp) + ret <32 x half> %r +} +declare <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half>, <32 x i16>) + +define <16 x float> @test_ldexp_16xfloat(<16 x float> %x, <16 x i32> %exp) nounwind { +; CHECK-LABEL: test_ldexp_16xfloat: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ps %zmm1, %zmm1 +; CHECK-NEXT: vscalefps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %r = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> %x, <16 x i32> %exp) + ret <16 x float> %r +} +declare <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float>, <16 x i32>) + +define <8 x double> @test_ldexp_8xdouble(<8 x double> %x, <8 x i32> %exp) nounwind { +; CHECK-LABEL: test_ldexp_8xdouble: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm1 +; CHECK-NEXT: vscalefpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %r = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> %x, <8 x i32> %exp) + ret <8 x double> %r +} +declare <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>) + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX512VLF: {{.*}} diff --git a/llvm/test/CodeGen/X86/llc-accept-avx10-512.ll b/llvm/test/CodeGen/X86/llc-accept-avx10-512.ll new file mode 100644 index 0000000..b5c9895 --- /dev/null +++ b/llvm/test/CodeGen/X86/llc-accept-avx10-512.ll @@ -0,0 +1,97 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 + +; avx10.x-512 is just avx10.x -- 512 is kept for compatibility purposes. 
+ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-512 2>&1 | FileCheck --check-prefixes=CHECK-AVX10_1 %s + +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 2>&1 | FileCheck --check-prefixes=CHECK-AVX10_2 %s + +; CHECK-AVX10_1-NOT: is not recognizable +; CHECK-AVX10_2-NOT: is not recognizable + +define <32 x bfloat> @foo_avx10.1(<16 x float> %a, <16 x float> %b) { +; CHECK-AVX10_1-LABEL: foo_avx10.1: +; CHECK-AVX10_1: # %bb.0: +; CHECK-AVX10_1-NEXT: vcvtne2ps2bf16 %zmm1, %zmm0, %zmm0 +; CHECK-AVX10_1-NEXT: retq +; +; CHECK-AVX10_2-LABEL: foo_avx10.1: +; CHECK-AVX10_2: # %bb.0: +; CHECK-AVX10_2-NEXT: vcvtne2ps2bf16 %zmm1, %zmm0, %zmm0 +; CHECK-AVX10_2-NEXT: retq + %ret = call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a, <16 x float> %b) + ret <32 x bfloat> %ret +} + +define <8 x i32> @foo_avx10.2(<8 x double> %f) { +; CHECK-AVX10_1-LABEL: foo_avx10.2: +; CHECK-AVX10_1: # %bb.0: +; CHECK-AVX10_1-NEXT: vextractf32x4 $2, %zmm0, %xmm1 +; CHECK-AVX10_1-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; CHECK-AVX10_1-NEXT: vmovsd {{.*#+}} xmm3 = [-2.147483648E+9,0.0E+0] +; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm2, %xmm4 +; CHECK-AVX10_1-NEXT: vmovsd {{.*#+}} xmm5 = [2.147483647E+9,0.0E+0] +; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm4, %xmm4 +; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm4, %ecx +; CHECK-AVX10_1-NEXT: xorl %eax, %eax +; CHECK-AVX10_1-NEXT: vucomisd %xmm2, %xmm2 +; CHECK-AVX10_1-NEXT: cmovpl %eax, %ecx +; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm1, %xmm2 +; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm2, %xmm2 +; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm2, %edx +; CHECK-AVX10_1-NEXT: vucomisd %xmm1, %xmm1 +; CHECK-AVX10_1-NEXT: cmovpl %eax, %edx +; CHECK-AVX10_1-NEXT: vmovd %edx, %xmm1 +; CHECK-AVX10_1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; CHECK-AVX10_1-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm2, %xmm4 +; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm4, %xmm4 +; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm4, %ecx +; CHECK-AVX10_1-NEXT: vucomisd %xmm2, %xmm2 +; CHECK-AVX10_1-NEXT: cmovpl %eax, %ecx +; CHECK-AVX10_1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; CHECK-AVX10_1-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] +; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm2, %xmm4 +; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm4, %xmm4 +; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm4, %ecx +; CHECK-AVX10_1-NEXT: vucomisd %xmm2, %xmm2 +; CHECK-AVX10_1-NEXT: cmovpl %eax, %ecx +; CHECK-AVX10_1-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 +; CHECK-AVX10_1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] +; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm2, %xmm4 +; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm4, %xmm4 +; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm4, %ecx +; CHECK-AVX10_1-NEXT: vucomisd %xmm2, %xmm2 +; CHECK-AVX10_1-NEXT: cmovpl %eax, %ecx +; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm0, %xmm2 +; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm2, %xmm2 +; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm2, %edx +; CHECK-AVX10_1-NEXT: vucomisd %xmm0, %xmm0 +; CHECK-AVX10_1-NEXT: cmovpl %eax, %edx +; CHECK-AVX10_1-NEXT: vmovd %edx, %xmm2 +; CHECK-AVX10_1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; CHECK-AVX10_1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm0, %xmm4 +; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm4, %xmm4 +; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm4, %ecx +; CHECK-AVX10_1-NEXT: vucomisd %xmm0, %xmm0 +; CHECK-AVX10_1-NEXT: cmovpl %eax, %ecx +; CHECK-AVX10_1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; CHECK-AVX10_1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm0, %xmm3 +; 
CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm3, %xmm3 +; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm3, %ecx +; CHECK-AVX10_1-NEXT: vucomisd %xmm0, %xmm0 +; CHECK-AVX10_1-NEXT: cmovpl %eax, %ecx +; CHECK-AVX10_1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; CHECK-AVX10_1-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-AVX10_1-NEXT: retq +; +; CHECK-AVX10_2-LABEL: foo_avx10.2: +; CHECK-AVX10_2: # %bb.0: +; CHECK-AVX10_2-NEXT: vcvttpd2dqs %zmm0, %ymm0 +; CHECK-AVX10_2-NEXT: retq + %x = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> %f) + ret <8 x i32> %x +} + diff --git a/llvm/test/CodeGen/X86/llc-fp-contract-warning.ll b/llvm/test/CodeGen/X86/llc-fp-contract-warning.ll new file mode 100644 index 0000000..2802593 --- /dev/null +++ b/llvm/test/CodeGen/X86/llc-fp-contract-warning.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast 2>&1 | grep "X86 backend ignores --fp-contract" + +; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=off 2>&1 | grep "X86 backend ignores --fp-contract" + +; on, as a default setting that's passed to backend when no --fp-contract option is specified, is not diagnosed. +; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=on 2>&1 | grep -v "X86 backend ignores --fp-contract" + +define float @foo(float %f) { + %res = fadd float %f, %f + ret float %res +} + diff --git a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll index 834dd78..9b02438 100644 --- a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll +++ b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll @@ -1,59 +1,213 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 5 -; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck -check-prefix=X86 %s +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck -check-prefix=X64 %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 | FileCheck --check-prefix=MACOS-SINCOS-STRET %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 | FileCheck --check-prefix=MACOS-NOSINCOS-STRET %s define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind { -; CHECK-LABEL: test_sincos_v4f32: -; CHECK: # %bb.0: -; CHECK-NEXT: pushl %edi -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: subl $52, %esp -; CHECK-NEXT: movl 84(%esp), %esi -; CHECK-NEXT: flds 76(%esp) -; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: flds 64(%esp) -; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: flds 72(%esp) -; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: flds 68(%esp) -; CHECK-NEXT: movl 80(%esp), %edi -; CHECK-NEXT: leal 40(%esp), %eax -; CHECK-NEXT: movl %eax, 8(%esp) -; CHECK-NEXT: leal 4(%edi), %eax -; CHECK-NEXT: movl %eax, 4(%esp) -; CHECK-NEXT: fstps (%esp) -; CHECK-NEXT: calll sincosf -; CHECK-NEXT: leal 44(%esp), %eax -; CHECK-NEXT: movl %eax, 8(%esp) -; CHECK-NEXT: leal 8(%edi), %eax -; CHECK-NEXT: movl %eax, 4(%esp) -; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; CHECK-NEXT: fstps (%esp) -; CHECK-NEXT: calll sincosf -; CHECK-NEXT: leal 36(%esp), %eax -; CHECK-NEXT: movl %eax, 8(%esp) -; CHECK-NEXT: movl %edi, 4(%esp) -; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; CHECK-NEXT: fstps (%esp) -; CHECK-NEXT: calll sincosf -; CHECK-NEXT: leal 48(%esp), %eax -; CHECK-NEXT: movl %eax, 8(%esp) -; CHECK-NEXT: addl $12, %edi -; 
CHECK-NEXT: movl %edi, 4(%esp) -; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; CHECK-NEXT: fstps (%esp) -; CHECK-NEXT: calll sincosf -; CHECK-NEXT: flds 36(%esp) -; CHECK-NEXT: flds 40(%esp) -; CHECK-NEXT: flds 44(%esp) -; CHECK-NEXT: flds 48(%esp) -; CHECK-NEXT: fstps 12(%esi) -; CHECK-NEXT: fstps 8(%esi) -; CHECK-NEXT: fstps 4(%esi) -; CHECK-NEXT: fstps (%esi) -; CHECK-NEXT: addl $52, %esp -; CHECK-NEXT: popl %esi -; CHECK-NEXT: popl %edi -; CHECK-NEXT: retl +; X86-LABEL: test_sincos_v4f32: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $52, %esp +; X86-NEXT: movl 84(%esp), %esi +; X86-NEXT: flds 76(%esp) +; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: flds 64(%esp) +; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: flds 72(%esp) +; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: flds 68(%esp) +; X86-NEXT: movl 80(%esp), %edi +; X86-NEXT: leal 40(%esp), %eax +; X86-NEXT: movl %eax, 8(%esp) +; X86-NEXT: leal 4(%edi), %eax +; X86-NEXT: movl %eax, 4(%esp) +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll sincosf +; X86-NEXT: leal 44(%esp), %eax +; X86-NEXT: movl %eax, 8(%esp) +; X86-NEXT: leal 8(%edi), %eax +; X86-NEXT: movl %eax, 4(%esp) +; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll sincosf +; X86-NEXT: leal 36(%esp), %eax +; X86-NEXT: movl %eax, 8(%esp) +; X86-NEXT: movl %edi, 4(%esp) +; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll sincosf +; X86-NEXT: leal 48(%esp), %eax +; X86-NEXT: movl %eax, 8(%esp) +; X86-NEXT: addl $12, %edi +; X86-NEXT: movl %edi, 4(%esp) +; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: fstps (%esp) +; X86-NEXT: calll sincosf +; X86-NEXT: flds 36(%esp) +; X86-NEXT: flds 40(%esp) +; X86-NEXT: flds 44(%esp) +; X86-NEXT: flds 48(%esp) +; X86-NEXT: fstps 12(%esi) +; X86-NEXT: fstps 8(%esi) +; X86-NEXT: fstps 4(%esi) +; X86-NEXT: fstps (%esi) +; X86-NEXT: addl $52, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: test_sincos_v4f32: +; X64: # %bb.0: +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %rbx +; X64-NEXT: subq $56, %rsp +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-NEXT: leaq 4(%rsp), %rdi +; X64-NEXT: movq %rsp, %rsi +; X64-NEXT: callq sincosf@PLT +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X64-NEXT: leaq 12(%rsp), %rdi +; X64-NEXT: leaq 8(%rsp), %rsi +; X64-NEXT: callq sincosf@PLT +; X64-NEXT: leaq 28(%rsp), %rdi +; X64-NEXT: leaq 24(%rsp), %rsi +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: callq sincosf@PLT +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: leaq 20(%rsp), %rdi +; X64-NEXT: leaq 16(%rsp), %rsi +; X64-NEXT: callq sincosf@PLT +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; 
X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-NEXT: movups %xmm1, (%r14) +; X64-NEXT: movups %xmm0, (%rbx) +; X64-NEXT: addq $56, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r14 +; X64-NEXT: retq +; +; MACOS-SINCOS-STRET-LABEL: test_sincos_v4f32: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: pushq %r14 +; MACOS-SINCOS-STRET-NEXT: pushq %rbx +; MACOS-SINCOS-STRET-NEXT: subq $104, %rsp +; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; MACOS-SINCOS-STRET-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, %xmm1 +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; MACOS-SINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; MACOS-SINCOS-STRET-NEXT: unpcklpd (%rsp), %xmm2 ## 16-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: ## xmm2 = xmm2[0],mem[0] +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; MACOS-SINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; MACOS-SINCOS-STRET-NEXT: movups %xmm1, (%r14) +; MACOS-SINCOS-STRET-NEXT: movups %xmm2, (%rbx) +; MACOS-SINCOS-STRET-NEXT: addq $104, %rsp +; MACOS-SINCOS-STRET-NEXT: popq %rbx +; 
MACOS-SINCOS-STRET-NEXT: popq %r14 +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v4f32: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: pushq %r14 +; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx +; MACOS-NOSINCOS-STRET-NEXT: subq $104, %rsp +; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cosf +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; MACOS-NOSINCOS-STRET-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm1 = xmm1[0],mem[0] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sinf +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; MACOS-NOSINCOS-STRET-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; MACOS-NOSINCOS-STRET-NEXT: ## xmm1 = xmm1[0],mem[0] +; MACOS-NOSINCOS-STRET-NEXT: movups %xmm1, (%r14) +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movups %xmm0, (%rbx) +; MACOS-NOSINCOS-STRET-NEXT: addq $104, 
%rsp +; MACOS-NOSINCOS-STRET-NEXT: popq %rbx +; MACOS-NOSINCOS-STRET-NEXT: popq %r14 +; MACOS-NOSINCOS-STRET-NEXT: retq %result = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x) %result.0 = extractvalue { <4 x float>, <4 x float> } %result, 0 %result.1 = extractvalue { <4 x float>, <4 x float> } %result, 1 @@ -63,36 +217,120 @@ define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias } define void @test_sincos_v2f64(<2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind { -; CHECK-LABEL: test_sincos_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: pushl %edi -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: subl $52, %esp -; CHECK-NEXT: movl 84(%esp), %esi -; CHECK-NEXT: fldl 72(%esp) -; CHECK-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill -; CHECK-NEXT: fldl 64(%esp) -; CHECK-NEXT: movl 80(%esp), %edi -; CHECK-NEXT: leal 24(%esp), %eax -; CHECK-NEXT: movl %eax, 12(%esp) -; CHECK-NEXT: movl %edi, 8(%esp) -; CHECK-NEXT: fstpl (%esp) -; CHECK-NEXT: calll sincos -; CHECK-NEXT: leal 32(%esp), %eax -; CHECK-NEXT: movl %eax, 12(%esp) -; CHECK-NEXT: addl $8, %edi -; CHECK-NEXT: movl %edi, 8(%esp) -; CHECK-NEXT: fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload -; CHECK-NEXT: fstpl (%esp) -; CHECK-NEXT: calll sincos -; CHECK-NEXT: fldl 24(%esp) -; CHECK-NEXT: fldl 32(%esp) -; CHECK-NEXT: fstpl 8(%esi) -; CHECK-NEXT: fstpl (%esi) -; CHECK-NEXT: addl $52, %esp -; CHECK-NEXT: popl %esi -; CHECK-NEXT: popl %edi -; CHECK-NEXT: retl +; X86-LABEL: test_sincos_v2f64: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $52, %esp +; X86-NEXT: movl 84(%esp), %esi +; X86-NEXT: fldl 72(%esp) +; X86-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill +; X86-NEXT: fldl 64(%esp) +; X86-NEXT: movl 80(%esp), %edi +; X86-NEXT: leal 24(%esp), %eax +; X86-NEXT: movl %eax, 12(%esp) +; X86-NEXT: movl %edi, 8(%esp) +; X86-NEXT: fstpl (%esp) +; X86-NEXT: calll sincos +; X86-NEXT: leal 32(%esp), %eax +; X86-NEXT: movl %eax, 12(%esp) +; X86-NEXT: addl $8, %edi +; X86-NEXT: movl %edi, 8(%esp) +; X86-NEXT: fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload +; X86-NEXT: fstpl (%esp) +; X86-NEXT: calll sincos +; X86-NEXT: fldl 24(%esp) +; X86-NEXT: fldl 32(%esp) +; X86-NEXT: fstpl 8(%esi) +; X86-NEXT: fstpl (%esi) +; X86-NEXT: addl $52, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: test_sincos_v2f64: +; X64: # %bb.0: +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %rbx +; X64-NEXT: subq $56, %rsp +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: leaq 24(%rsp), %rdi +; X64-NEXT: leaq 16(%rsp), %rsi +; X64-NEXT: callq sincos@PLT +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X64-NEXT: leaq 8(%rsp), %rdi +; X64-NEXT: movq %rsp, %rsi +; X64-NEXT: callq sincos@PLT +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; X64-NEXT: movups %xmm1, (%r14) +; X64-NEXT: movups %xmm0, (%rbx) +; X64-NEXT: addq $56, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r14 +; X64-NEXT: retq +; +; MACOS-SINCOS-STRET-LABEL: test_sincos_v2f64: +; MACOS-SINCOS-STRET: ## %bb.0: +; MACOS-SINCOS-STRET-NEXT: pushq %r14 +; MACOS-SINCOS-STRET-NEXT: pushq %rbx +; MACOS-SINCOS-STRET-NEXT: subq $56, %rsp +; 
MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret +; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-SINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; MACOS-SINCOS-STRET-NEXT: movups %xmm1, (%r14) +; MACOS-SINCOS-STRET-NEXT: movups %xmm2, (%rbx) +; MACOS-SINCOS-STRET-NEXT: addq $56, %rsp +; MACOS-SINCOS-STRET-NEXT: popq %rbx +; MACOS-SINCOS-STRET-NEXT: popq %r14 +; MACOS-SINCOS-STRET-NEXT: retq +; +; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v2f64: +; MACOS-NOSINCOS-STRET: ## %bb.0: +; MACOS-NOSINCOS-STRET-NEXT: pushq %r14 +; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx +; MACOS-NOSINCOS-STRET-NEXT: subq $56, %rsp +; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx +; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14 +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cos +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: callq _cos +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sin +; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: callq _sin +; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; MACOS-NOSINCOS-STRET-NEXT: movups %xmm1, (%r14) +; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; MACOS-NOSINCOS-STRET-NEXT: movups %xmm0, (%rbx) +; MACOS-NOSINCOS-STRET-NEXT: addq $56, %rsp +; MACOS-NOSINCOS-STRET-NEXT: popq %rbx +; MACOS-NOSINCOS-STRET-NEXT: popq %r14 +; MACOS-NOSINCOS-STRET-NEXT: retq %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x) %result.0 = extractvalue { <2 x double>, <2 x double> } %result, 0 %result.1 = extractvalue { <2 x double>, <2 x double> } %result, 1 diff --git a/llvm/test/CodeGen/X86/llvm.sincospi.ll b/llvm/test/CodeGen/X86/llvm.sincospi.ll new file mode 100644 index 0000000..5546c66 --- /dev/null +++ b/llvm/test/CodeGen/X86/llvm.sincospi.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=x86_64-apple-macosx10.9 < %s | FileCheck %s + +define { half, half } @test_sincospi_f16(half %a) #0 
{ +; CHECK-LABEL: test_sincospi_f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movzwl %ax, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %result = call { half, half } @llvm.sincospi.f16(half %a) + ret { half, half } %result +} + +define half @test_sincospi_f16_only_use_sin(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16_only_use_sin: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movzwl %ax, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: movq %rsp, %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %result = call { half, half } @llvm.sincospi.f16(half %a) + %result.0 = extractvalue { half, half } %result, 0 + ret half %result.0 +} + +define half @test_sincospi_f16_only_use_cos(half %a) #0 { +; CHECK-LABEL: test_sincospi_f16_only_use_cos: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movzwl %ax, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %result = call { half, half } @llvm.sincospi.f16(half %a) + %result.1 = extractvalue { half, half } %result, 1 + ret half %result.1 +} + +define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $64, %rsp +; CHECK-NEXT: pextrw $0, %xmm0, %ebx +; CHECK-NEXT: psrld $16, %xmm0 +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movzwl %ax, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movzwl %bx, %edi +; CHECK-NEXT: callq ___extendhfsf2 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq ___truncsfhf2 +; CHECK-NEXT: ## kill: def $ax killed $ax def $eax +; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; CHECK-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; CHECK-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; CHECK-NEXT: addq $64, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + %result = call { <2 x half>, <2 x half> } @llvm.sincospi.v2f16(<2 x half> %a) + ret { <2 x half>, <2 x half> } %result +} + +define { float, float } @test_sincospi_f32(float %a) #0 { +; CHECK-LABEL: test_sincospi_f32: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %result = call { float, float } @llvm.sincospi.f32(float %a) + ret { float, float } %result +} + +define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f32: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %result = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> %a) + ret { <2 x float>, <2 x float> } %result +} + +define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) #0 { +; CHECK-LABEL: test_sincospi_v3f32: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq 
{{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospif +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: retq + %result = call { <3 x float>, <3 x float> } @llvm.sincospi.v3f32(<3 x float> %a) + ret { <3 x float>, <3 x float> } %result +} + +define { double, double } @test_sincospi_f64(double %a) #0 { +; CHECK-LABEL: test_sincospi_f64: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospi +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: retq + %result = call { double, double } @llvm.sincospi.f64(double %a) + ret { double, double } %result +} + +define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) #0 { +; CHECK-LABEL: test_sincospi_v2f64: +; CHECK: ## %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: callq ___sincospi +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq %rsp, %rsi +; CHECK-NEXT: callq ___sincospi +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: retq + %result = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x double> %a) + ret { <2 x double>, <2 x double> } %result +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce5.ll b/llvm/test/CodeGen/X86/loop-strength-reduce5.ll index ebae51f..0800373 100644 --- a/llvm/test/CodeGen/X86/loop-strength-reduce5.ll +++ b/llvm/test/CodeGen/X86/loop-strength-reduce5.ll @@ -16,11 +16,11 @@ define void @foo(i32 %N) nounwind { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: # %bb ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movw %dx, X -; CHECK-NEXT: movw %cx, Y -; CHECK-NEXT: incl %edx -; CHECK-NEXT: addl $4, %ecx -; CHECK-NEXT: cmpl %edx, %eax +; CHECK-NEXT: movw %cx, X +; CHECK-NEXT: movw %dx, Y +; CHECK-NEXT: incl %ecx +; CHECK-NEXT: addl $4, %edx +; CHECK-NEXT: cmpl %ecx, %eax ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: .LBB0_3: # %return ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index 2a2a4a5..209ee79 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -1480,15 +1480,15 @@ define i32 @test_unsigned_short_512(ptr nocapture readonly, ptr nocapture readon ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmulld %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: addq $16, %rcx ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: jne .LBB10_1 ; AVX2-NEXT: # %bb.2: # %middle.block -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1728,10 +1728,10 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado ; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmulld %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmulld %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmulld %ymm7, %ymm4, %ymm4 ; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3 @@ -1739,9 +1739,9 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: jne .LBB11_1 ; AVX2-NEXT: # %bb.2: # %middle.block -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1765,15 +1765,15 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmulld %zmm2, %zmm4, %zmm2 -; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmulld %zmm3, %zmm2, %zmm2 -; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512-NEXT: addq $16, %rcx ; AVX512-NEXT: cmpq %rcx, %rax ; 
AVX512-NEXT: jne .LBB11_1 ; AVX512-NEXT: # %bb.2: # %middle.block -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index caec02e..58adbb7 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -207,15 +207,15 @@ declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> , <16 x ptr> , i32 , < ; SCALAR-NEXT: store i32 %Elt2, ptr %Ptr23, align 4 define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) { -; X64-LABEL: test6: -; X64: # %bb.0: -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} -; X64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} -; X64-NEXT: vmovdqa %ymm2, %ymm0 -; X64-NEXT: retq +; X64-KNL-LABEL: test6: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; X64-KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; X64-KNL-NEXT: vmovdqa %ymm2, %ymm0 +; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test6: ; X86-KNL: # %bb.0: @@ -230,11 +230,21 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) { ; X86-KNL-NEXT: vmovdqa %ymm2, %ymm0 ; X86-KNL-NEXT: retl ; +; X64-SKX-LABEL: test6: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; X64-SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; X64-SKX-NEXT: vmovdqa %ymm2, %ymm0 +; X64-SKX-NEXT: retq +; ; X86-SKX-LABEL: test6: ; X86-SKX: # %bb.0: -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k2 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k2 ; X86-SKX-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2} ; X86-SKX-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1} ; X86-SKX-NEXT: vmovdqa %ymm2, %ymm0 @@ -255,9 +265,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) { ; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-KNL-NEXT: kmovw %k1, %k2 ; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} -; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm2 -; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} -; X64-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; X64-KNL-NEXT: vmovdqa %ymm1, %ymm2 +; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} +; X64-KNL-NEXT: vpaddd %ymm1, %ymm2, %ymm0 ; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test7: @@ -271,9 +281,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) { ; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-KNL-NEXT: kmovw %k1, %k2 ; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} -; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm2 -; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} -; X86-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm2 +; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} +; X86-KNL-NEXT: vpaddd %ymm1, %ymm2, %ymm0 ; X86-KNL-NEXT: retl ; ; X64-SKX-LABEL: test7: @@ -397,7 +407,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; X64-SKX-SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 ; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-SMALL-NEXT: retq @@ -412,7 +422,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-LARGE-NEXT: retq @@ -424,7 +434,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X86-SKX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -481,7 +491,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; X64-SKX-SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 ; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-SMALL-NEXT: retq @@ -496,7 +506,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-LARGE-NEXT: retq @@ -508,7 +518,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X86-SKX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -2465,17 +2475,17 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> declare <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x ptr>) define <16 x ptr> @test31(<16 x ptr> %ptrs) { -; X64-LABEL: test31: -; X64: # %bb.0: -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} -; X64-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} -; X64-NEXT: vmovdqa64 %zmm3, %zmm0 -; X64-NEXT: vmovdqa64 %zmm2, %zmm1 -; X64-NEXT: retq +; X64-KNL-LABEL: test31: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: 
kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} +; X64-KNL-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} +; X64-KNL-NEXT: vmovdqa64 %zmm3, %zmm0 +; X64-KNL-NEXT: vmovdqa64 %zmm2, %zmm1 +; X64-KNL-NEXT: retq ; ; X86-LABEL: test31: ; X86: # %bb.0: @@ -2484,6 +2494,18 @@ define <16 x ptr> @test31(<16 x ptr> %ptrs) { ; X86-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl +; +; X64-SKX-LABEL: test31: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} +; X64-SKX-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} +; X64-SKX-NEXT: vmovdqa64 %zmm3, %zmm0 +; X64-SKX-NEXT: vmovdqa64 %zmm2, %zmm1 +; X64-SKX-NEXT: retq %res = call <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x ptr> undef) ret <16 x ptr>%res } @@ -3253,17 +3275,17 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) { ; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0 ; X64-KNL-NEXT: retq ; -; X86-LABEL: test_global_array: -; X86: # %bb.0: -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} -; X86-NEXT: vmovdqa %ymm1, %ymm0 -; X86-NEXT: retl +; X86-KNL-LABEL: test_global_array: +; X86-KNL: # %bb.0: +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0 +; X86-KNL-NEXT: retl ; ; X64-SKX-SMALL-LABEL: test_global_array: ; X64-SKX-SMALL: # %bb.0: -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0 @@ -3272,11 +3294,19 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) { ; X64-SKX-LARGE-LABEL: test_global_array: ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0 ; X64-SKX-LARGE-NEXT: retq +; +; X86-SKX-LABEL: test_global_array: +; X86-SKX: # %bb.0: +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0 +; X86-SKX-NEXT: retl %p = getelementptr inbounds [16 x i32], ptr @glob_array, i64 0, <8 x i64> %indxs %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) ret <8 x i32> %g @@ -3291,17 +3321,17 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) { ; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0 ; X64-KNL-NEXT: retq ; -; X86-LABEL: test_global_array_zeroinitializer_index: -; X86: # %bb.0: -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} 
-; X86-NEXT: vmovdqa %ymm1, %ymm0 -; X86-NEXT: retl +; X86-KNL-LABEL: test_global_array_zeroinitializer_index: +; X86-KNL: # %bb.0: +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0 +; X86-KNL-NEXT: retl ; ; X64-SKX-SMALL-LABEL: test_global_array_zeroinitializer_index: ; X64-SKX-SMALL: # %bb.0: -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0 @@ -3310,11 +3340,19 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) { ; X64-SKX-LARGE-LABEL: test_global_array_zeroinitializer_index: ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0 ; X64-SKX-LARGE-NEXT: retq +; +; X86-SKX-LABEL: test_global_array_zeroinitializer_index: +; X86-SKX: # %bb.0: +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0 +; X86-SKX-NEXT: retl %p = getelementptr inbounds [16 x i32], ptr @glob_array, <8 x i64> zeroinitializer, <8 x i64> %indxs %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) ret <8 x i32> %g @@ -3545,7 +3583,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X64-SKX-LABEL: sext_v8i8_index: ; X64-SKX: # %bb.0: ; X64-SKX-NEXT: vpmovsxbd %xmm0, %ymm1 -; X64-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} ; X64-SKX-NEXT: retq @@ -3554,7 +3592,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X86-SKX: # %bb.0: ; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: vpmovsxbd %xmm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -3617,7 +3655,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X64-SKX-LABEL: zext_v8i8_index: ; X64-SKX: # %bb.0: ; X64-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X64-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} ; X64-SKX-NEXT: retq @@ -3626,7 +3664,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X86-SKX: # %bb.0: ; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, 
%xmm0, %xmm0 ; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -4793,19 +4831,19 @@ define <16 x i32> @pr163023_sext(ptr %a0, <16 x i32> %a1) { } define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { -; X64-LABEL: pr163023_zext: -; X64: # %bb.0: -; X64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; X64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} -; X64-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} -; X64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 -; X64-NEXT: retq +; X64-KNL-LABEL: pr163023_zext: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} +; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} +; X64-KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; X64-KNL-NEXT: retq ; ; X86-LABEL: pr163023_zext: ; X86: # %bb.0: @@ -4815,6 +4853,20 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { ; X86-NEXT: vpgatherdd (%eax,%zmm0), %zmm1 {%k1} ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl +; +; X64-SKX-LABEL: pr163023_zext: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} +; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} +; X64-SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; X64-SKX-NEXT: retq %addr.p = ptrtoint ptr %a0 to i64 %addr.v = insertelement <1 x i64> poison, i64 %addr.p, i64 0 %addr.splat = shufflevector <1 x i64> %addr.v, <1 x i64> poison, <16 x i32> zeroinitializer @@ -4834,21 +4886,37 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { %struct.foo = type { ptr, i64, i16, i16, i32 } define <8 x i64> @pr45906(<8 x ptr> %ptr) { -; X64-LABEL: pr45906: -; X64: # %bb.0: # %bb -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} -; X64-NEXT: vmovdqa64 %zmm1, %zmm0 -; X64-NEXT: retq +; X64-KNL-LABEL: pr45906: +; X64-KNL: # %bb.0: # %bb +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-KNL-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} +; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm0 +; X64-KNL-NEXT: retq ; -; X86-LABEL: pr45906: -; X86: # %bb.0: # %bb -; X86-NEXT: kxnorw %k0, %k0, %k1 
-; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} -; X86-NEXT: vmovdqa64 %zmm1, %zmm0 -; X86-NEXT: retl +; X86-KNL-LABEL: pr45906: +; X86-KNL: # %bb.0: # %bb +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} +; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm0 +; X86-KNL-NEXT: retl +; +; X64-SKX-LABEL: pr45906: +; X64-SKX: # %bb.0: # %bb +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-SKX-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} +; X64-SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; X64-SKX-NEXT: retq +; +; X86-SKX-LABEL: pr45906: +; X86-SKX: # %bb.0: # %bb +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} +; X86-SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; X86-SKX-NEXT: retl bb: %tmp = getelementptr inbounds %struct.foo, <8 x ptr> %ptr, i64 0, i32 1 %tmp1 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> %tmp, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> undef) diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index 18d394e..57b0577 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -4,9 +4,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512FVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefixes=AVX512FVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefixes=AVX512BWVL define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i32: @@ -350,14 +350,21 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: truncstore_v8i64_v8i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512FVL-LABEL: truncstore_v8i64_v8i32: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k1 +; AVX512FVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512FVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512FVL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; +; AVX512BWVL-LABEL: truncstore_v8i64_v8i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 +; AVX512BWVL-NEXT: vpmovsqd %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, 
zeroinitializer %b = icmp slt <8 x i64> %x, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647> %c = select <8 x i1> %b, <8 x i64> %x, <8 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647> @@ -964,9 +971,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v8i64_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqw %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1572,9 +1577,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v8i64_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1788,14 +1791,21 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: truncstore_v4i64_v4i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512FVL-LABEL: truncstore_v4i64_v4i32: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k1 +; AVX512FVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512FVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512FVL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; +; AVX512BWVL-LABEL: truncstore_v4i64_v4i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 +; AVX512BWVL-NEXT: vpmovsqd %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp slt <4 x i64> %x, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647> %c = select <4 x i1> %b, <4 x i64> %x, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647> @@ -2141,9 +2151,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v4i64_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqw %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -2495,9 +2503,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v4i64_v4i8: ; 
AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -2641,13 +2647,19 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: truncstore_v2i64_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} -; AVX512VL-NEXT: retq +; AVX512FVL-LABEL: truncstore_v2i64_v2i32: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmq %xmm1, %xmm1, %k1 +; AVX512FVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512FVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512FVL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} +; AVX512FVL-NEXT: retq +; +; AVX512BWVL-LABEL: truncstore_v2i64_v2i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 +; AVX512BWVL-NEXT: vpmovsqd %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp slt <2 x i64> %x, <i64 2147483647, i64 2147483647> %c = select <2 x i1> %b, <2 x i64> %x, <2 x i64> <i64 2147483647, i64 2147483647> @@ -2832,9 +2844,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512BWVL-LABEL: truncstore_v2i64_v2i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqw %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp slt <2 x i64> %x, <i64 32767, i64 32767> @@ -3018,9 +3028,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512BWVL-LABEL: truncstore_v2i64_v2i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqb %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp slt <2 x i64> %x, <i64 127, i64 127> @@ -3816,9 +3824,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v16i32_v16i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdw %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -4594,9 +4600,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v16i32_v16i8: ; AVX512BWVL: # %bb.0: ; 
AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -5034,9 +5038,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v8i32_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdw %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -5473,9 +5475,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v8i32_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -5686,9 +5686,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v4i32_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdw %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp slt <4 x i32> %x, <i32 32767, i32 32767, i32 32767, i32 32767> @@ -5904,9 +5902,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v4i32_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdb %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp slt <4 x i32> %x, <i32 127, i32 127, i32 127, i32 127> @@ -7332,9 +7328,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX512BWVL-LABEL: truncstore_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovswb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <32 x i8> %mask, zeroinitializer @@ -8083,9 +8077,7 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; AVX512BWVL-LABEL: truncstore_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovswb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i8> %mask, zeroinitializer @@ -8445,9 +8437,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX512BWVL-LABEL: truncstore_v8i16_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovswb %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i16> %mask, zeroinitializer %b = icmp slt <8 x i16> %x, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index 4c4b6e7..0386d95 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -4,9 +4,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512FVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefixes=AVX512FVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefixes=AVX512BWVL define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i32: @@ -281,13 +281,20 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: truncstore_v8i64_v8i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512FVL-LABEL: truncstore_v8i64_v8i32: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k1 +; AVX512FVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512FVL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; +; AVX512BWVL-LABEL: truncstore_v8i64_v8i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 +; AVX512BWVL-NEXT: vpmovusqd %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer %b = icmp ult <8 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> %c = select <8 x i1> %b, <8 x i64> %x, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> @@ -829,8 
+836,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v8i64_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqw %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1367,8 +1373,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v8i64_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1547,13 +1552,20 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: truncstore_v4i64_v4i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512FVL-LABEL: truncstore_v4i64_v4i32: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k1 +; AVX512FVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512FVL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; +; AVX512BWVL-LABEL: truncstore_v4i64_v4i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 +; AVX512BWVL-NEXT: vpmovusqd %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp ult <4 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> %c = select <4 x i1> %b, <4 x i64> %x, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> @@ -1868,8 +1880,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v4i64_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqw %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -2188,8 +2199,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v4i64_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -2304,12 +2314,18 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: truncstore_v2i64_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} -; AVX512VL-NEXT: retq +; 
AVX512FVL-LABEL: truncstore_v2i64_v2i32: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmq %xmm1, %xmm1, %k1 +; AVX512FVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512FVL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} +; AVX512FVL-NEXT: retq +; +; AVX512BWVL-LABEL: truncstore_v2i64_v2i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 +; AVX512BWVL-NEXT: vpmovusqd %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp ult <2 x i64> %x, <i64 4294967295, i64 4294967295> %c = select <2 x i1> %b, <2 x i64> %x, <2 x i64> <i64 4294967295, i64 4294967295> @@ -2470,8 +2486,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512BWVL-LABEL: truncstore_v2i64_v2i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqw %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp ult <2 x i64> %x, <i64 65535, i64 65535> @@ -2630,8 +2645,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512BWVL-LABEL: truncstore_v2i64_v2i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqb %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp ult <2 x i64> %x, <i64 255, i64 255> @@ -3457,8 +3471,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v16i32_v16i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdw %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -4273,8 +4286,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v16i32_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -4737,8 +4749,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v8i32_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdw %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -5194,8 +5205,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v8i32_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper 
; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -5455,8 +5465,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v4i32_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdw %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp ult <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535> @@ -5717,8 +5726,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX512BWVL-LABEL: truncstore_v4i32_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdb %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp ult <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255> @@ -7171,8 +7179,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX512BWVL-LABEL: truncstore_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovuswb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <32 x i8> %mask, zeroinitializer @@ -7935,8 +7942,7 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; AVX512BWVL-LABEL: truncstore_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovuswb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i8> %mask, zeroinitializer @@ -8302,8 +8308,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX512BWVL-LABEL: truncstore_v8i16_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovuswb %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i16> %mask, zeroinitializer %b = icmp ult <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index 388d852..f38b769 100644 --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -368,46 +368,47 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind { ; AVX512F-NEXT: vaddps %xmm4, %xmm9, %xmm9 ; AVX512F-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX512F-NEXT: vmulss %xmm1, %xmm4, %xmm10 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm5[1,1,3,3] -; AVX512F-NEXT: vmulss %xmm6, %xmm5, %xmm6 -; AVX512F-NEXT: vaddss %xmm6, %xmm10, %xmm6 -; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm10 -; AVX512F-NEXT: vmulss %xmm8, %xmm10, %xmm8 -; AVX512F-NEXT: vaddss %xmm6, %xmm8, %xmm6 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm9[0,1],xmm6[0],xmm9[3] +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm5[1,1,3,3] +; AVX512F-NEXT: vmulss %xmm6, 
%xmm11, %xmm5 +; AVX512F-NEXT: vaddss %xmm5, %xmm10, %xmm5 +; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512F-NEXT: vmulss %xmm6, %xmm8, %xmm8 +; AVX512F-NEXT: vaddss %xmm5, %xmm8, %xmm5 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm5[0],xmm9[3] ; AVX512F-NEXT: vmulps %xmm7, %xmm0, %xmm8 ; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm9 -; AVX512F-NEXT: vmovsldup {{.*#+}} xmm11 = xmm9[0,0,2,2] -; AVX512F-NEXT: vmulps %xmm2, %xmm11, %xmm11 -; AVX512F-NEXT: vaddps %xmm11, %xmm8, %xmm8 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm9[1,1,3,3] -; AVX512F-NEXT: vmulps %xmm3, %xmm11, %xmm12 +; AVX512F-NEXT: vmovsldup {{.*#+}} xmm10 = xmm9[0,0,2,2] +; AVX512F-NEXT: vmulps %xmm2, %xmm10, %xmm10 +; AVX512F-NEXT: vaddps %xmm10, %xmm8, %xmm8 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm9[1,1,3,3] +; AVX512F-NEXT: vmulps %xmm3, %xmm10, %xmm12 ; AVX512F-NEXT: vaddps %xmm12, %xmm8, %xmm8 ; AVX512F-NEXT: vmulss %xmm7, %xmm4, %xmm7 -; AVX512F-NEXT: vmulss %xmm5, %xmm9, %xmm12 +; AVX512F-NEXT: vmulss %xmm9, %xmm11, %xmm12 ; AVX512F-NEXT: vaddss %xmm7, %xmm12, %xmm7 -; AVX512F-NEXT: vmulss %xmm11, %xmm10, %xmm11 -; AVX512F-NEXT: vaddss %xmm7, %xmm11, %xmm7 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm7 = xmm8[0,1],xmm7[0],xmm8[3] -; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm9[3,3,3,3] -; AVX512F-NEXT: vshufpd {{.*#+}} xmm11 = xmm9[1,0] +; AVX512F-NEXT: vmulss %xmm6, %xmm10, %xmm10 +; AVX512F-NEXT: vaddss %xmm7, %xmm10, %xmm7 +; AVX512F-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm12 = xmm9[1,0] ; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,2,2,2] ; AVX512F-NEXT: vmulps %xmm0, %xmm9, %xmm0 -; AVX512F-NEXT: vmulps %xmm2, %xmm8, %xmm2 +; AVX512F-NEXT: vmulps %xmm2, %xmm10, %xmm2 ; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm1 ; AVX512F-NEXT: vbroadcastss %xmm1, %xmm2 ; AVX512F-NEXT: vmulps %xmm2, %xmm3, %xmm2 ; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vmulss %xmm4, %xmm11, %xmm2 -; AVX512F-NEXT: vmulss %xmm5, %xmm8, %xmm3 +; AVX512F-NEXT: vmulss %xmm4, %xmm12, %xmm2 +; AVX512F-NEXT: vmulss %xmm10, %xmm11, %xmm3 ; AVX512F-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vmulss %xmm1, %xmm10, %xmm1 +; AVX512F-NEXT: vmulss %xmm1, %xmm6, %xmm1 ; AVX512F-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm2 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm8[1,1,3,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[2,3] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0,1,2],xmm8[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_mul3x3_f32: @@ -447,26 +448,27 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind { ; AVX512VL-NEXT: vaddss %xmm7, %xmm12, %xmm7 ; AVX512VL-NEXT: vmulss %xmm11, %xmm10, %xmm11 ; AVX512VL-NEXT: vaddss %xmm7, %xmm11, %xmm7 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3] -; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm11 = xmm8[1,0] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm11 = xmm8[3,3,3,3] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm12 = xmm8[1,0] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2] ; 
AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0 -; AVX512VL-NEXT: vmulps %xmm7, %xmm2, %xmm2 +; AVX512VL-NEXT: vmulps %xmm2, %xmm11, %xmm2 ; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vextractf32x4 $2, %zmm1, %xmm1 ; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm2 ; AVX512VL-NEXT: vmulps %xmm2, %xmm6, %xmm2 ; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vmulss %xmm11, %xmm9, %xmm2 -; AVX512VL-NEXT: vmulss %xmm7, %xmm4, %xmm4 +; AVX512VL-NEXT: vmulss %xmm12, %xmm9, %xmm2 +; AVX512VL-NEXT: vmulss %xmm4, %xmm11, %xmm4 ; AVX512VL-NEXT: vaddss %xmm4, %xmm2, %xmm2 ; AVX512VL-NEXT: vmulss %xmm1, %xmm10, %xmm1 ; AVX512VL-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm3, %zmm2 -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm5[1,1,3,3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[2,3] +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm5[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq entry: %block = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> <i32 0, i32 1> diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll index 595f849..d8be4cf 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512 ; ; 32-bit SSE tests to make sure we do reasonable things. 
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86-SSE,X86-SSE1 @@ -353,6 +353,39 @@ define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp ret <4 x float> %res3 } +define <4 x float> @merge_v4f32_f32_3210(ptr %ptr) nounwind uwtable noinline ssp { +; SSE-LABEL: merge_v4f32_f32_3210: +; SSE: # %bb.0: +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: merge_v4f32_f32_3210: +; AVX: # %bb.0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0] +; AVX-NEXT: retq +; +; X86-SSE-LABEL: merge_v4f32_f32_3210: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movups (%eax), %xmm0 +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X86-SSE-NEXT: retl + %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3 + %ptr1 = getelementptr inbounds float, ptr %ptr, i64 2 + %ptr2 = getelementptr inbounds float, ptr %ptr, i64 1 + %ptr3 = getelementptr inbounds float, ptr %ptr, i64 0 + %val0 = load float, ptr %ptr0, align 4 + %val1 = load float, ptr %ptr1, align 4 + %val2 = load float, ptr %ptr2, align 4 + %val3 = load float, ptr %ptr3, align 4 + %res0 = insertelement <4 x float> poison, float %val0, i64 0 + %res1 = insertelement <4 x float> %res0, float %val1, i64 1 + %res2 = insertelement <4 x float> %res1, float %val2, i64 2 + %res3 = insertelement <4 x float> %res2, float %val3, i64 3 + ret <4 x float> %res3 +} + define <4 x i32> @merge_4i32_i32_23u5(ptr %ptr) nounwind uwtable noinline ssp { ; SSE-LABEL: merge_4i32_i32_23u5: ; SSE: # %bb.0: @@ -724,6 +757,63 @@ define <4 x i32> @merge_4i32_i32_45zz_inc5(ptr %ptr) nounwind uwtable noinline s ret <4 x i32> %res1 } +define <4 x i32> @merge_v4i32_i32_3210(ptr %ptr) nounwind uwtable noinline ssp { +; SSE-LABEL: merge_v4i32_i32_3210: +; SSE: # %bb.0: +; SSE-NEXT: movdqu (%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: merge_v4i32_i32_3210: +; AVX: # %bb.0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0] +; AVX-NEXT: retq +; +; X86-SSE1-LABEL: merge_v4i32_i32_3210: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %edi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X86-SSE1-NEXT: .cfi_offset %esi, -12 +; X86-SSE1-NEXT: .cfi_offset %edi, -8 +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE1-NEXT: movl 12(%ecx), %edx +; X86-SSE1-NEXT: movl 8(%ecx), %esi +; X86-SSE1-NEXT: movl (%ecx), %edi +; X86-SSE1-NEXT: movl 4(%ecx), %ecx +; X86-SSE1-NEXT: movl %edi, 12(%eax) +; X86-SSE1-NEXT: movl %ecx, 8(%eax) +; X86-SSE1-NEXT: movl %esi, 4(%eax) +; X86-SSE1-NEXT: movl %edx, (%eax) +; X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: popl %edi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: retl $4 +; +; X86-SSE41-LABEL: merge_v4i32_i32_3210: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X86-SSE41-NEXT: retl + %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 3 + %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 2 + %ptr2 = getelementptr inbounds i32, ptr %ptr, i64 1 + %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 0 + %val0 = load i32, ptr %ptr0, align 4 + %val1 = load i32, ptr %ptr1, align 4 + %val2 = load i32, ptr %ptr2, align 4 + %val3 = load i32, ptr %ptr3, align 4 + %res0 = 
insertelement <4 x i32> poison, i32 %val0, i64 0 + %res1 = insertelement <4 x i32> %res0, i32 %val1, i64 1 + %res2 = insertelement <4 x i32> %res1, i32 %val2, i64 2 + %res3 = insertelement <4 x i32> %res2, i32 %val3, i64 3 + ret <4 x i32> %res3 +} + define <8 x i16> @merge_8i16_i16_23u567u9(ptr %ptr) nounwind uwtable noinline ssp { ; SSE-LABEL: merge_8i16_i16_23u567u9: ; SSE: # %bb.0: @@ -862,6 +952,110 @@ define <8 x i16> @merge_8i16_i16_45u7zzzz(ptr %ptr) nounwind uwtable noinline ss ret <8 x i16> %res7 } +define <8 x i16> @merge_8i16_i16_76543210(ptr %ptr) nounwind uwtable noinline ssp { +; SSE2-LABEL: merge_8i16_i16_76543210: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE2-NEXT: retq +; +; SSE41-LABEL: merge_8i16_i16_76543210: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqu (%rdi), %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: merge_8i16_i16_76543210: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqu (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] +; AVX-NEXT: retq +; +; X86-SSE1-LABEL: merge_8i16_i16_76543210: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %ebp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: pushl %ebx +; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X86-SSE1-NEXT: pushl %edi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 20 +; X86-SSE1-NEXT: pushl %eax +; X86-SSE1-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE1-NEXT: .cfi_offset %esi, -20 +; X86-SSE1-NEXT: .cfi_offset %edi, -16 +; X86-SSE1-NEXT: .cfi_offset %ebx, -12 +; X86-SSE1-NEXT: .cfi_offset %ebp, -8 +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movzwl 14(%eax), %ecx +; X86-SSE1-NEXT: movw %cx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; X86-SSE1-NEXT: movzwl 12(%eax), %ecx +; X86-SSE1-NEXT: movw %cx, (%esp) # 2-byte Spill +; X86-SSE1-NEXT: movzwl 10(%eax), %esi +; X86-SSE1-NEXT: movzwl 8(%eax), %edi +; X86-SSE1-NEXT: movzwl 6(%eax), %ebx +; X86-SSE1-NEXT: movzwl 4(%eax), %ebp +; X86-SSE1-NEXT: movzwl (%eax), %ecx +; X86-SSE1-NEXT: movzwl 2(%eax), %edx +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movw %cx, 14(%eax) +; X86-SSE1-NEXT: movw %dx, 12(%eax) +; X86-SSE1-NEXT: movw %bp, 10(%eax) +; X86-SSE1-NEXT: movw %bx, 8(%eax) +; X86-SSE1-NEXT: movw %di, 6(%eax) +; X86-SSE1-NEXT: movw %si, 4(%eax) +; X86-SSE1-NEXT: movzwl (%esp), %ecx # 2-byte Folded Reload +; X86-SSE1-NEXT: movw %cx, 2(%eax) +; X86-SSE1-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 2-byte Folded Reload +; X86-SSE1-NEXT: movw %cx, (%eax) +; X86-SSE1-NEXT: addl $4, %esp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 20 +; X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE1-NEXT: popl %edi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: popl %ebp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: retl $4 +; +; X86-SSE41-LABEL: merge_8i16_i16_76543210: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] +; X86-SSE41-NEXT: retl + %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 7 + %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 6 
+ %ptr2 = getelementptr inbounds i16, ptr %ptr, i64 5 + %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 4 + %ptr4 = getelementptr inbounds i16, ptr %ptr, i64 3 + %ptr5 = getelementptr inbounds i16, ptr %ptr, i64 2 + %ptr6 = getelementptr inbounds i16, ptr %ptr, i64 1 + %ptr7 = getelementptr inbounds i16, ptr %ptr, i64 0 + %val0 = load i16, ptr %ptr0 + %val1 = load i16, ptr %ptr1 + %val2 = load i16, ptr %ptr2 + %val3 = load i16, ptr %ptr3 + %val4 = load i16, ptr %ptr4 + %val5 = load i16, ptr %ptr5 + %val6 = load i16, ptr %ptr6 + %val7 = load i16, ptr %ptr7 + %res0 = insertelement <8 x i16> poison, i16 %val0, i64 0 + %res1 = insertelement <8 x i16> %res0, i16 %val1, i64 1 + %res2 = insertelement <8 x i16> %res1, i16 %val2, i64 2 + %res3 = insertelement <8 x i16> %res2, i16 %val3, i64 3 + %res4 = insertelement <8 x i16> %res3, i16 %val4, i64 4 + %res5 = insertelement <8 x i16> %res4, i16 %val5, i64 5 + %res6 = insertelement <8 x i16> %res5, i16 %val6, i64 6 + %res7 = insertelement <8 x i16> %res6, i16 %val7, i64 7 + ret <8 x i16> %res7 +} + define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(ptr %ptr) nounwind uwtable noinline ssp { ; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF: ; SSE: # %bb.0: @@ -1056,6 +1250,164 @@ define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(ptr %ptr) nounwind uwtable noin ret <16 x i8> %resF } +define <16 x i8> @merge_16i8_i8_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp { +; SSE2-LABEL: merge_16i8_i8_FEDCBA9876543210: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: merge_16i8_i8_FEDCBA9876543210: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqu (%rdi), %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: merge_16i8_i8_FEDCBA9876543210: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqu (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX-NEXT: retq +; +; X86-SSE1-LABEL: merge_16i8_i8_FEDCBA9876543210: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %ebx +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X86-SSE1-NEXT: subl $12, %esp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 24 +; X86-SSE1-NEXT: .cfi_offset %esi, -12 +; X86-SSE1-NEXT: .cfi_offset %ebx, -8 +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE1-NEXT: movzbl 15(%esi), %ecx +; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SSE1-NEXT: movzbl 14(%esi), %ecx +; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SSE1-NEXT: movzbl 13(%esi), %ecx +; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SSE1-NEXT: movzbl 
12(%esi), %ecx +; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SSE1-NEXT: movzbl 11(%esi), %ecx +; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SSE1-NEXT: movzbl 10(%esi), %ecx +; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SSE1-NEXT: movzbl 9(%esi), %ecx +; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SSE1-NEXT: movzbl 8(%esi), %ecx +; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SSE1-NEXT: movzbl 7(%esi), %ecx +; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SSE1-NEXT: movzbl 6(%esi), %ecx +; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SSE1-NEXT: movb 5(%esi), %bh +; X86-SSE1-NEXT: movb 4(%esi), %bl +; X86-SSE1-NEXT: movb 3(%esi), %dh +; X86-SSE1-NEXT: movb 2(%esi), %ch +; X86-SSE1-NEXT: movb (%esi), %cl +; X86-SSE1-NEXT: movb 1(%esi), %dl +; X86-SSE1-NEXT: movb %cl, 15(%eax) +; X86-SSE1-NEXT: movb %dl, 14(%eax) +; X86-SSE1-NEXT: movb %ch, 13(%eax) +; X86-SSE1-NEXT: movb %dh, 12(%eax) +; X86-SSE1-NEXT: movb %bl, 11(%eax) +; X86-SSE1-NEXT: movb %bh, 10(%eax) +; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SSE1-NEXT: movb %cl, 9(%eax) +; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SSE1-NEXT: movb %cl, 8(%eax) +; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SSE1-NEXT: movb %cl, 7(%eax) +; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SSE1-NEXT: movb %cl, 6(%eax) +; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SSE1-NEXT: movb %cl, 5(%eax) +; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SSE1-NEXT: movb %cl, 4(%eax) +; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SSE1-NEXT: movb %cl, 3(%eax) +; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SSE1-NEXT: movb %cl, 2(%eax) +; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SSE1-NEXT: movb %cl, 1(%eax) +; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SSE1-NEXT: movb %cl, (%eax) +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: retl $4 +; +; X86-SSE41-LABEL: merge_16i8_i8_FEDCBA9876543210: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X86-SSE41-NEXT: retl + %ptr0 = getelementptr inbounds i8, ptr %ptr, i64 15 + %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 14 + %ptr2 = getelementptr inbounds i8, ptr %ptr, i64 13 + %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 12 + %ptr4 = getelementptr inbounds i8, ptr %ptr, i64 11 + %ptr5 = getelementptr inbounds i8, ptr %ptr, i64 10 + %ptr6 = getelementptr inbounds i8, ptr %ptr, i64 9 + %ptr7 = getelementptr inbounds i8, ptr %ptr, i64 8 + %ptr8 = getelementptr inbounds i8, ptr %ptr, i64 7 + %ptr9 = getelementptr inbounds i8, ptr %ptr, i64 6 + %ptrA = getelementptr inbounds i8, ptr %ptr, i64 5 + %ptrB = getelementptr inbounds i8, ptr %ptr, i64 4 + %ptrC = getelementptr inbounds i8, ptr %ptr, i64 3 + %ptrD = getelementptr inbounds 
i8, ptr %ptr, i64 2 + %ptrE = getelementptr inbounds i8, ptr %ptr, i64 1 + %ptrF = getelementptr inbounds i8, ptr %ptr, i64 0 + %val0 = load i8, ptr %ptr0 + %val1 = load i8, ptr %ptr1 + %val2 = load i8, ptr %ptr2 + %val3 = load i8, ptr %ptr3 + %val4 = load i8, ptr %ptr4 + %val5 = load i8, ptr %ptr5 + %val6 = load i8, ptr %ptr6 + %val7 = load i8, ptr %ptr7 + %val8 = load i8, ptr %ptr8 + %val9 = load i8, ptr %ptr9 + %valA = load i8, ptr %ptrA + %valB = load i8, ptr %ptrB + %valC = load i8, ptr %ptrC + %valD = load i8, ptr %ptrD + %valE = load i8, ptr %ptrE + %valF = load i8, ptr %ptrF + %res0 = insertelement <16 x i8> poison, i8 %val0, i64 0 + %res1 = insertelement <16 x i8> %res0, i8 %val1, i64 1 + %res2 = insertelement <16 x i8> %res1, i8 %val2, i64 2 + %res3 = insertelement <16 x i8> %res2, i8 %val3, i64 3 + %res4 = insertelement <16 x i8> %res3, i8 %val4, i64 4 + %res5 = insertelement <16 x i8> %res4, i8 %val5, i64 5 + %res6 = insertelement <16 x i8> %res5, i8 %val6, i64 6 + %res7 = insertelement <16 x i8> %res6, i8 %val7, i64 7 + %res8 = insertelement <16 x i8> %res7, i8 %val8, i64 8 + %res9 = insertelement <16 x i8> %res8, i8 %val9, i64 9 + %resA = insertelement <16 x i8> %res9, i8 %valA, i64 10 + %resB = insertelement <16 x i8> %resA, i8 %valB, i64 11 + %resC = insertelement <16 x i8> %resB, i8 %valC, i64 12 + %resD = insertelement <16 x i8> %resC, i8 %valD, i64 13 + %resE = insertelement <16 x i8> %resD, i8 %valE, i64 14 + %resF = insertelement <16 x i8> %resE, i8 %valF, i64 15 + ret <16 x i8> %resF +} + define void @merge_4i32_i32_combine(ptr %dst, ptr %src) { ; SSE-LABEL: merge_4i32_i32_combine: ; SSE: # %bb.0: @@ -1285,3 +1637,90 @@ define <4 x i32> @load_i32_zext_i128_v4i32(ptr %ptr) { %3 = bitcast i128 %2 to <4 x i32> ret <4 x i32> %3 } + +; Don't attempt to reverse a partial VZEXT_LOAD +define <4 x i32> @no_reverse_vzload(ptr %p0) nounwind { +; SSE2-LABEL: no_reverse_vzload: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: paddd %xmm1, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: no_reverse_vzload: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: paddd %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: no_reverse_vzload: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: no_reverse_vzload: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: no_reverse_vzload: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq +; +; 
X86-SSE1-LABEL: no_reverse_vzload: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %ebx +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE1-NEXT: xorl %ecx, %ecx +; X86-SSE1-NEXT: cmpl $0, (%edx) +; X86-SSE1-NEXT: setg %cl +; X86-SSE1-NEXT: negl %ecx +; X86-SSE1-NEXT: xorl %ebx, %ebx +; X86-SSE1-NEXT: cmpl $0, 4(%edx) +; X86-SSE1-NEXT: setg %bl +; X86-SSE1-NEXT: negl %ebx +; X86-SSE1-NEXT: movl %ebx, 4(%eax) +; X86-SSE1-NEXT: movl %ecx, (%eax) +; X86-SSE1-NEXT: movl $0, 12(%eax) +; X86-SSE1-NEXT: movl $0, 8(%eax) +; X86-SSE1-NEXT: popl %ebx +; X86-SSE1-NEXT: retl $4 +; +; X86-SSE41-LABEL: no_reverse_vzload: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE41-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] +; X86-SSE41-NEXT: pxor %xmm2, %xmm2 +; X86-SSE41-NEXT: paddd %xmm1, %xmm1 +; X86-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; X86-SSE41-NEXT: retl + %i0 = load <2 x i32>, ptr %p0, align 4 + %i1 = shufflevector <2 x i32> %i0, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> + %i2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %i1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %i3 = shl <4 x i32> %i2, <i32 4, i32 4, i32 1, i32 1> + %i4 = shufflevector <4 x i32> %i1, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 6, i32 7> + %i5 = icmp slt <4 x i32> %i3, %i4 + %i6 = sext <4 x i1> %i5 to <4 x i32> + ret <4 x i32> %i6 +} diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll index 33e8d62..6ad306d 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll @@ -126,6 +126,44 @@ define <4 x double> @merge_4f64_f64_45zz(ptr %ptr) nounwind uwtable noinline ssp ret <4 x double> %res1 } +define <4 x double> @merge_v4f64_f64_3210(ptr %ptr) nounwind uwtable noinline ssp { +; AVX1-LABEL: merge_v4f64_f64_3210: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_v4f64_f64_3210: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_v4f64_f64_3210: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512F-NEXT: retq +; +; X86-AVX-LABEL: merge_v4f64_f64_3210: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; X86-AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; X86-AVX-NEXT: retl + %ptr0 = getelementptr inbounds double, ptr %ptr, i64 3 + %ptr1 = getelementptr inbounds double, ptr %ptr, i64 2 + %ptr2 = getelementptr inbounds double, ptr %ptr, i64 1 + %ptr3 = getelementptr inbounds double, ptr %ptr, i64 0 + %val0 = load double, ptr %ptr0, align 4 + %val1 = load double, ptr %ptr1, align 4 + %val2 = load double, ptr %ptr2, align 4 + %val3 = load double, ptr %ptr3, align 4 + %res0 = insertelement <4 x double> poison, double %val0, i64 0 + %res1 = insertelement <4 x double> %res0, double %val1, i64 1 + %res2 = insertelement <4 x double> %res1, double %val2, i64 2 + %res3 = insertelement <4 x double> %res2, double %val3, i64 3 + ret <4 x double> %res3 +} + define <4 x double> @merge_4f64_f64_34z6(ptr %ptr) nounwind uwtable noinline ssp { ; AVX-LABEL: merge_4f64_f64_34z6: ; AVX: # %bb.0: @@ -234,6 +272,51 
@@ define <4 x i64> @merge_4i64_i64_23zz(ptr %ptr) nounwind uwtable noinline ssp { ret <4 x i64> %res1 } +define <4 x i64> @merge_v4i64_i64_3210(ptr %ptr) nounwind uwtable noinline ssp { +; AVX1-LABEL: merge_v4i64_i64_3210: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_v4i64_i64_3210: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_v4i64_i64_3210: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512F-NEXT: retq +; +; X86-AVX-LABEL: merge_v4i64_i64_3210: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpinsrd $1, 12(%eax), %xmm0, %xmm0 +; X86-AVX-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 +; X86-AVX-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpinsrd $1, 28(%eax), %xmm1, %xmm1 +; X86-AVX-NEXT: vpinsrd $2, 16(%eax), %xmm1, %xmm1 +; X86-AVX-NEXT: vpinsrd $3, 20(%eax), %xmm1, %xmm1 +; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-AVX-NEXT: retl + %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 3 + %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 2 + %ptr2 = getelementptr inbounds i64, ptr %ptr, i64 1 + %ptr3 = getelementptr inbounds i64, ptr %ptr, i64 0 + %val0 = load i64, ptr %ptr0, align 4 + %val1 = load i64, ptr %ptr1, align 4 + %val2 = load i64, ptr %ptr2, align 4 + %val3 = load i64, ptr %ptr3, align 4 + %res0 = insertelement <4 x i64> poison, i64 %val0, i64 0 + %res1 = insertelement <4 x i64> %res0, i64 %val1, i64 1 + %res2 = insertelement <4 x i64> %res1, i64 %val2, i64 2 + %res3 = insertelement <4 x i64> %res2, i64 %val3, i64 3 + ret <4 x i64> %res3 +} + define <8 x float> @merge_8f32_2f32_23z5(ptr %ptr) nounwind uwtable noinline ssp { ; AVX-LABEL: merge_8f32_2f32_23z5: ; AVX: # %bb.0: @@ -335,6 +418,58 @@ define <8 x float> @merge_8f32_f32_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ret <8 x float> %res7 } +define <8 x float> @merge_8f32_f32_76543210(ptr %ptr) nounwind uwtable noinline ssp { +; AVX1-LABEL: merge_8f32_f32_76543210: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_8f32_f32_76543210: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_8f32_f32_76543210: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-NEXT: retq +; +; X86-AVX-LABEL: merge_8f32_f32_76543210: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; X86-AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; X86-AVX-NEXT: retl + %ptr0 = getelementptr inbounds float, ptr %ptr, i64 7 + %ptr1 = getelementptr inbounds float, ptr %ptr, i64 6 + %ptr2 = getelementptr inbounds float, ptr %ptr, i64 5 + %ptr3 = getelementptr inbounds float, ptr %ptr, i64 4 + %ptr4 = getelementptr inbounds float, ptr %ptr, i64 3 + %ptr5 = getelementptr inbounds float, ptr %ptr, i64 2 + %ptr6 = getelementptr inbounds float, ptr %ptr, i64 1 + %ptr7 = getelementptr inbounds float, ptr %ptr, i64 0 + %val0 = load float, ptr %ptr0 
+ %val1 = load float, ptr %ptr1 + %val2 = load float, ptr %ptr2 + %val3 = load float, ptr %ptr3 + %val4 = load float, ptr %ptr4 + %val5 = load float, ptr %ptr5 + %val6 = load float, ptr %ptr6 + %val7 = load float, ptr %ptr7 + %res0 = insertelement <8 x float> poison, float %val0, i64 0 + %res1 = insertelement <8 x float> %res0, float %val1, i64 1 + %res2 = insertelement <8 x float> %res1, float %val2, i64 2 + %res3 = insertelement <8 x float> %res2, float %val3, i64 3 + %res4 = insertelement <8 x float> %res3, float %val4, i64 4 + %res5 = insertelement <8 x float> %res4, float %val5, i64 5 + %res6 = insertelement <8 x float> %res5, float %val6, i64 6 + %res7 = insertelement <8 x float> %res6, float %val7, i64 7 + ret <8 x float> %res7 +} + define <8 x i32> @merge_8i32_4i32_z3(ptr %ptr) nounwind uwtable noinline ssp { ; AVX-LABEL: merge_8i32_4i32_z3: ; AVX: # %bb.0: @@ -414,6 +549,58 @@ define <8 x i32> @merge_8i32_i32_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ss ret <8 x i32> %res7 } +define <8 x i32> @merge_8i32_i32_76543210(ptr %ptr) nounwind uwtable noinline ssp { +; AVX1-LABEL: merge_8i32_i32_76543210: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_8i32_i32_76543210: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_8i32_i32_76543210: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-NEXT: retq +; +; X86-AVX-LABEL: merge_8i32_i32_76543210: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; X86-AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; X86-AVX-NEXT: retl + %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 7 + %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 6 + %ptr2 = getelementptr inbounds i32, ptr %ptr, i64 5 + %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 4 + %ptr4 = getelementptr inbounds i32, ptr %ptr, i64 3 + %ptr5 = getelementptr inbounds i32, ptr %ptr, i64 2 + %ptr6 = getelementptr inbounds i32, ptr %ptr, i64 1 + %ptr7 = getelementptr inbounds i32, ptr %ptr, i64 0 + %val0 = load i32, ptr %ptr0 + %val1 = load i32, ptr %ptr1 + %val2 = load i32, ptr %ptr2 + %val3 = load i32, ptr %ptr3 + %val4 = load i32, ptr %ptr4 + %val5 = load i32, ptr %ptr5 + %val6 = load i32, ptr %ptr6 + %val7 = load i32, ptr %ptr7 + %res0 = insertelement <8 x i32> poison, i32 %val0, i64 0 + %res1 = insertelement <8 x i32> %res0, i32 %val1, i64 1 + %res2 = insertelement <8 x i32> %res1, i32 %val2, i64 2 + %res3 = insertelement <8 x i32> %res2, i32 %val3, i64 3 + %res4 = insertelement <8 x i32> %res3, i32 %val4, i64 4 + %res5 = insertelement <8 x i32> %res4, i32 %val5, i64 5 + %res6 = insertelement <8 x i32> %res5, i32 %val6, i64 6 + %res7 = insertelement <8 x i32> %res6, i32 %val7, i64 7 + ret <8 x i32> %res7 +} + define <16 x i16> @merge_16i16_i16_89zzzuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp { ; AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz: ; AVX: # %bb.0: @@ -522,6 +709,92 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable n ret <16 x i16> %resF } +define <16 x i16> @merge_16i16_i16_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp { +; AVX1-LABEL: merge_16i16_i16_FEDCBA9876543210: +; AVX1: # %bb.0: +; 
AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_16i16_i16_FEDCBA9876543210: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_16i16_i16_FEDCBA9876543210: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-NEXT: retq +; +; X86-AVX-LABEL: merge_16i16_i16_FEDCBA9876543210: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovdqu (%eax), %xmm0 +; X86-AVX-NEXT: vmovdqu 16(%eax), %xmm1 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] +; X86-AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-AVX-NEXT: retl + %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 15 + %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 14 + %ptr2 = getelementptr inbounds i16, ptr %ptr, i64 13 + %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 12 + %ptr4 = getelementptr inbounds i16, ptr %ptr, i64 11 + %ptr5 = getelementptr inbounds i16, ptr %ptr, i64 10 + %ptr6 = getelementptr inbounds i16, ptr %ptr, i64 9 + %ptr7 = getelementptr inbounds i16, ptr %ptr, i64 8 + %ptr8 = getelementptr inbounds i16, ptr %ptr, i64 7 + %ptr9 = getelementptr inbounds i16, ptr %ptr, i64 6 + %ptrA = getelementptr inbounds i16, ptr %ptr, i64 5 + %ptrB = getelementptr inbounds i16, ptr %ptr, i64 4 + %ptrC = getelementptr inbounds i16, ptr %ptr, i64 3 + %ptrD = getelementptr inbounds i16, ptr %ptr, i64 2 + %ptrE = getelementptr inbounds i16, ptr %ptr, i64 1 + %ptrF = getelementptr inbounds i16, ptr %ptr, i64 0 + %val0 = load i16, ptr %ptr0 + %val1 = load i16, ptr %ptr1 + %val2 = load i16, ptr %ptr2 + %val3 = load i16, ptr %ptr3 + %val4 = load i16, ptr %ptr4 + %val5 = load i16, ptr %ptr5 + %val6 = load i16, ptr %ptr6 + %val7 = load i16, ptr %ptr7 + %val8 = load i16, ptr %ptr8 + %val9 = load i16, ptr %ptr9 + %valA = load i16, ptr %ptrA + %valB = load i16, ptr %ptrB + %valC = load i16, ptr %ptrC + %valD = load i16, ptr %ptrD + %valE = load i16, ptr %ptrE + %valF = load i16, ptr %ptrF + %res0 = insertelement <16 x i16> poison, i16 %val0, i64 0 + %res1 = insertelement <16 x i16> %res0, i16 %val1, i64 1 + %res2 = insertelement <16 x i16> %res1, i16 %val2, i64 2 + %res3 = insertelement <16 x i16> %res2, i16 %val3, i64 3 + %res4 = insertelement <16 x i16> %res3, i16 %val4, i64 4 + %res5 = insertelement <16 x i16> %res4, i16 %val5, i64 5 + %res6 = insertelement <16 x i16> %res5, i16 %val6, i64 6 + %res7 = insertelement <16 x i16> %res6, i16 %val7, i64 7 + %res8 = insertelement <16 x i16> %res7, i16 %val8, i64 8 + %res9 = insertelement <16 x i16> %res8, i16 %val9, i64 9 + %resA = insertelement <16 x i16> %res9, i16 %valA, i64 10 + %resB = insertelement <16 x i16> %resA, i16 %valB, i64 11 + %resC = insertelement <16 x i16> %resB, i16 %valC, i64 12 + %resD = insertelement <16 x i16> %resC, i16 %valD, 
i64 13 + %resE = insertelement <16 x i16> %resD, i16 %valE, i64 14 + %resF = insertelement <16 x i16> %resE, i16 %valF, i64 15 + ret <16 x i16> %resF +} + define <32 x i8> @merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(ptr %ptr) nounwind uwtable noinline ssp { ; AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu: ; AVX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll index 790bed4..f9a0bd7 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -148,6 +148,46 @@ define <8 x double> @merge_8f64_f64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ret <8 x double> %res7 } +define <8 x double> @merge_8f64_f64_76543210(ptr %ptr) nounwind uwtable noinline ssp { +; ALL-LABEL: merge_8f64_f64_76543210: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; ALL-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; ALL-NEXT: retq +; +; X86-AVX512F-LABEL: merge_8f64_f64_76543210: +; X86-AVX512F: # %bb.0: +; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; X86-AVX512F-NEXT: vpermpd (%eax), %zmm0, %zmm0 +; X86-AVX512F-NEXT: retl + %ptr0 = getelementptr inbounds double, ptr %ptr, i64 7 + %ptr1 = getelementptr inbounds double, ptr %ptr, i64 6 + %ptr2 = getelementptr inbounds double, ptr %ptr, i64 5 + %ptr3 = getelementptr inbounds double, ptr %ptr, i64 4 + %ptr4 = getelementptr inbounds double, ptr %ptr, i64 3 + %ptr5 = getelementptr inbounds double, ptr %ptr, i64 2 + %ptr6 = getelementptr inbounds double, ptr %ptr, i64 1 + %ptr7 = getelementptr inbounds double, ptr %ptr, i64 0 + %val0 = load double, ptr %ptr0 + %val1 = load double, ptr %ptr1 + %val2 = load double, ptr %ptr2 + %val3 = load double, ptr %ptr3 + %val4 = load double, ptr %ptr4 + %val5 = load double, ptr %ptr5 + %val6 = load double, ptr %ptr6 + %val7 = load double, ptr %ptr7 + %res0 = insertelement <8 x double> poison, double %val0, i64 0 + %res1 = insertelement <8 x double> %res0, double %val1, i64 1 + %res2 = insertelement <8 x double> %res1, double %val2, i64 2 + %res3 = insertelement <8 x double> %res2, double %val3, i64 3 + %res4 = insertelement <8 x double> %res3, double %val4, i64 4 + %res5 = insertelement <8 x double> %res4, double %val5, i64 5 + %res6 = insertelement <8 x double> %res5, double %val6, i64 6 + %res7 = insertelement <8 x double> %res6, double %val7, i64 7 + ret <8 x double> %res7 +} + define <8 x i64> @merge_8i64_4i64_z3(ptr %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8i64_4i64_z3: ; ALL: # %bb.0: @@ -227,6 +267,63 @@ define <8 x i64> @merge_8i64_i64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ss ret <8 x i64> %res7 } +define <8 x i64> @merge_8i64_i64_76543210(ptr %ptr) nounwind uwtable noinline ssp { +; ALL-LABEL: merge_8i64_i64_76543210: +; ALL: # %bb.0: +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; ALL-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; ALL-NEXT: retq +; +; X86-AVX512F-LABEL: merge_8i64_i64_76543210: +; X86-AVX512F: # %bb.0: +; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512F-NEXT: vpinsrd $1, 12(%eax), %xmm0, %xmm0 +; X86-AVX512F-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 +; X86-AVX512F-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0 +; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX512F-NEXT: vpinsrd $1, 28(%eax), %xmm1, %xmm1 +; X86-AVX512F-NEXT: vpinsrd $2, 16(%eax), 
%xmm1, %xmm1 +; X86-AVX512F-NEXT: vpinsrd $3, 20(%eax), %xmm1, %xmm1 +; X86-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX512F-NEXT: vpinsrd $1, 44(%eax), %xmm1, %xmm1 +; X86-AVX512F-NEXT: vpinsrd $2, 32(%eax), %xmm1, %xmm1 +; X86-AVX512F-NEXT: vpinsrd $3, 36(%eax), %xmm1, %xmm1 +; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-AVX512F-NEXT: vpinsrd $1, 60(%eax), %xmm2, %xmm2 +; X86-AVX512F-NEXT: vpinsrd $2, 48(%eax), %xmm2, %xmm2 +; X86-AVX512F-NEXT: vpinsrd $3, 52(%eax), %xmm2, %xmm2 +; X86-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X86-AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X86-AVX512F-NEXT: retl + %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 7 + %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 6 + %ptr2 = getelementptr inbounds i64, ptr %ptr, i64 5 + %ptr3 = getelementptr inbounds i64, ptr %ptr, i64 4 + %ptr4 = getelementptr inbounds i64, ptr %ptr, i64 3 + %ptr5 = getelementptr inbounds i64, ptr %ptr, i64 2 + %ptr6 = getelementptr inbounds i64, ptr %ptr, i64 1 + %ptr7 = getelementptr inbounds i64, ptr %ptr, i64 0 + %val0 = load i64, ptr %ptr0 + %val1 = load i64, ptr %ptr1 + %val2 = load i64, ptr %ptr2 + %val3 = load i64, ptr %ptr3 + %val4 = load i64, ptr %ptr4 + %val5 = load i64, ptr %ptr5 + %val6 = load i64, ptr %ptr6 + %val7 = load i64, ptr %ptr7 + %res0 = insertelement <8 x i64> poison, i64 %val0, i64 0 + %res1 = insertelement <8 x i64> %res0, i64 %val1, i64 1 + %res2 = insertelement <8 x i64> %res1, i64 %val2, i64 2 + %res3 = insertelement <8 x i64> %res2, i64 %val3, i64 3 + %res4 = insertelement <8 x i64> %res3, i64 %val4, i64 4 + %res5 = insertelement <8 x i64> %res4, i64 %val5, i64 5 + %res6 = insertelement <8 x i64> %res5, i64 %val6, i64 6 + %res7 = insertelement <8 x i64> %res6, i64 %val7, i64 7 + ret <8 x i64> %res7 +} + define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz: ; ALL: # %bb.0: @@ -335,6 +432,70 @@ define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable ret <16 x float> %resF } +define <16 x float> @merge_16f32_f32_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp { +; ALL-LABEL: merge_16f32_f32_FEDCBA9876543210: +; ALL: # %bb.0: +; ALL-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1] +; ALL-NEXT: retq +; +; X86-AVX512F-LABEL: merge_16f32_f32_FEDCBA9876543210: +; X86-AVX512F: # %bb.0: +; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512F-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; X86-AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1] +; X86-AVX512F-NEXT: retl + %ptr0 = getelementptr inbounds float, ptr %ptr, i64 15 + %ptr1 = getelementptr inbounds float, ptr %ptr, i64 14 + %ptr2 = getelementptr inbounds float, ptr %ptr, i64 13 + %ptr3 = getelementptr inbounds float, ptr %ptr, i64 12 + %ptr4 = getelementptr inbounds float, ptr %ptr, i64 11 + %ptr5 = getelementptr inbounds float, ptr %ptr, i64 10 + %ptr6 = getelementptr inbounds float, ptr %ptr, i64 9 + %ptr7 = getelementptr inbounds float, ptr %ptr, i64 8 + %ptr8 = getelementptr inbounds float, ptr %ptr, i64 7 + %ptr9 = getelementptr inbounds float, ptr %ptr, i64 6 + %ptrA = getelementptr inbounds float, ptr %ptr, i64 5 + %ptrB = getelementptr inbounds float, ptr %ptr, i64 4 + %ptrC = getelementptr inbounds float, ptr 
%ptr, i64 3 + %ptrD = getelementptr inbounds float, ptr %ptr, i64 2 + %ptrE = getelementptr inbounds float, ptr %ptr, i64 1 + %ptrF = getelementptr inbounds float, ptr %ptr, i64 0 + %val0 = load float, ptr %ptr0 + %val1 = load float, ptr %ptr1 + %val2 = load float, ptr %ptr2 + %val3 = load float, ptr %ptr3 + %val4 = load float, ptr %ptr4 + %val5 = load float, ptr %ptr5 + %val6 = load float, ptr %ptr6 + %val7 = load float, ptr %ptr7 + %val8 = load float, ptr %ptr8 + %val9 = load float, ptr %ptr9 + %valA = load float, ptr %ptrA + %valB = load float, ptr %ptrB + %valC = load float, ptr %ptrC + %valD = load float, ptr %ptrD + %valE = load float, ptr %ptrE + %valF = load float, ptr %ptrF + %res0 = insertelement <16 x float> poison, float %val0, i64 0 + %res1 = insertelement <16 x float> %res0, float %val1, i64 1 + %res2 = insertelement <16 x float> %res1, float %val2, i64 2 + %res3 = insertelement <16 x float> %res2, float %val3, i64 3 + %res4 = insertelement <16 x float> %res3, float %val4, i64 4 + %res5 = insertelement <16 x float> %res4, float %val5, i64 5 + %res6 = insertelement <16 x float> %res5, float %val6, i64 6 + %res7 = insertelement <16 x float> %res6, float %val7, i64 7 + %res8 = insertelement <16 x float> %res7, float %val8, i64 8 + %res9 = insertelement <16 x float> %res8, float %val9, i64 9 + %resA = insertelement <16 x float> %res9, float %valA, i64 10 + %resB = insertelement <16 x float> %resA, float %valB, i64 11 + %resC = insertelement <16 x float> %resB, float %valC, i64 12 + %resD = insertelement <16 x float> %resC, float %valD, i64 13 + %resE = insertelement <16 x float> %resD, float %valE, i64 14 + %resF = insertelement <16 x float> %resE, float %valF, i64 15 + ret <16 x float> %resF +} + define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz: ; ALL: # %bb.0: @@ -443,6 +604,70 @@ define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable n ret <16 x i32> %resF } +define <16 x i32> @merge_16i32_i32_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp { +; ALL-LABEL: merge_16i32_i32_FEDCBA9876543210: +; ALL: # %bb.0: +; ALL-NEXT: vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; ALL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1] +; ALL-NEXT: retq +; +; X86-AVX512F-LABEL: merge_16i32_i32_FEDCBA9876543210: +; X86-AVX512F: # %bb.0: +; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; X86-AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1] +; X86-AVX512F-NEXT: retl + %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 15 + %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 14 + %ptr2 = getelementptr inbounds i32, ptr %ptr, i64 13 + %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 12 + %ptr4 = getelementptr inbounds i32, ptr %ptr, i64 11 + %ptr5 = getelementptr inbounds i32, ptr %ptr, i64 10 + %ptr6 = getelementptr inbounds i32, ptr %ptr, i64 9 + %ptr7 = getelementptr inbounds i32, ptr %ptr, i64 8 + %ptr8 = getelementptr inbounds i32, ptr %ptr, i64 7 + %ptr9 = getelementptr inbounds i32, ptr %ptr, i64 6 + %ptrA = getelementptr inbounds i32, ptr %ptr, i64 5 + %ptrB = getelementptr inbounds i32, ptr %ptr, i64 4 + %ptrC = getelementptr inbounds i32, ptr %ptr, i64 3 + %ptrD = getelementptr inbounds i32, ptr %ptr, i64 2 + %ptrE = getelementptr inbounds i32, ptr %ptr, i64 1 + %ptrF = getelementptr inbounds i32, ptr %ptr, i64 0 + %val0 = load i32, ptr 
%ptr0 + %val1 = load i32, ptr %ptr1 + %val2 = load i32, ptr %ptr2 + %val3 = load i32, ptr %ptr3 + %val4 = load i32, ptr %ptr4 + %val5 = load i32, ptr %ptr5 + %val6 = load i32, ptr %ptr6 + %val7 = load i32, ptr %ptr7 + %val8 = load i32, ptr %ptr8 + %val9 = load i32, ptr %ptr9 + %valA = load i32, ptr %ptrA + %valB = load i32, ptr %ptrB + %valC = load i32, ptr %ptrC + %valD = load i32, ptr %ptrD + %valE = load i32, ptr %ptrE + %valF = load i32, ptr %ptrF + %res0 = insertelement <16 x i32> poison, i32 %val0, i64 0 + %res1 = insertelement <16 x i32> %res0, i32 %val1, i64 1 + %res2 = insertelement <16 x i32> %res1, i32 %val2, i64 2 + %res3 = insertelement <16 x i32> %res2, i32 %val3, i64 3 + %res4 = insertelement <16 x i32> %res3, i32 %val4, i64 4 + %res5 = insertelement <16 x i32> %res4, i32 %val5, i64 5 + %res6 = insertelement <16 x i32> %res5, i32 %val6, i64 6 + %res7 = insertelement <16 x i32> %res6, i32 %val7, i64 7 + %res8 = insertelement <16 x i32> %res7, i32 %val8, i64 8 + %res9 = insertelement <16 x i32> %res8, i32 %val9, i64 9 + %resA = insertelement <16 x i32> %res9, i32 %valA, i64 10 + %resB = insertelement <16 x i32> %resA, i32 %valB, i64 11 + %resC = insertelement <16 x i32> %resB, i32 %valC, i64 12 + %resD = insertelement <16 x i32> %resC, i32 %valD, i64 13 + %resE = insertelement <16 x i32> %resD, i32 %valE, i64 14 + %resF = insertelement <16 x i32> %resE, i32 %valF, i64 15 + ret <16 x i32> %resF +} + define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(ptr %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz: ; ALL: # %bb.0: diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index a798f4c..541ca9d 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -2368,17 +2368,15 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; SSE41-NEXT: psubb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 -; SSE41-NEXT: pand %xmm3, %xmm5 -; SSE41-NEXT: pandn %xmm2, %xmm3 -; SSE41-NEXT: pmaddubsw %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pandn %xmm2, %xmm4 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm1 ; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm5 -; SSE41-NEXT: paddb %xmm5, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm3 +; SSE41-NEXT: paddb %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: vec128_i8_signed_reg_reg: @@ -2390,14 +2388,13 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpandn 
%xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -2429,12 +2426,10 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -2447,12 +2442,10 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -2591,17 +2584,15 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; SSE41-NEXT: psubb %xmm2, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm3, %xmm5 -; SSE41-NEXT: pand %xmm2, %xmm5 -; SSE41-NEXT: pandn %xmm4, %xmm2 -; SSE41-NEXT: pmaddubsw %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pmullw %xmm4, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm3, %xmm2 +; SSE41-NEXT: pandn %xmm4, %xmm3 +; SSE41-NEXT: pmaddubsw %xmm3, %xmm1 ; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: por %xmm1, %xmm5 -; SSE41-NEXT: paddb %xmm5, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: paddb %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: vec128_i8_unsigned_reg_reg: @@ -2615,14 +2606,13 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 
x i8> %a1, <16 x i8> %a2) nounw ; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -2656,12 +2646,10 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -2674,12 +2662,10 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -2822,16 +2808,14 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; SSE41-NEXT: psubb %xmm3, %xmm0 ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 -; 
SSE41-NEXT: pand %xmm3, %xmm5 -; SSE41-NEXT: pandn %xmm2, %xmm3 -; SSE41-NEXT: pmaddubsw %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pandn %xmm2, %xmm4 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm0 ; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -2845,14 +2829,13 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -2886,12 +2869,10 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] +; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3 +; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2],xmm3[2],xmm0[4],xmm3[4],xmm0[6],xmm3[6],xmm0[8],xmm3[8],xmm0[10],xmm3[10],xmm0[12],xmm3[12],xmm0[14],xmm3[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -2905,12 +2886,10 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3 +; XOPAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[2],xmm3[2],xmm0[4],xmm3[4],xmm0[6],xmm3[6],xmm0[8],xmm3[8],xmm0[10],xmm3[10],xmm0[12],xmm3[12],xmm0[14],xmm3[14] ; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -3053,16 +3032,14 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; SSE41-NEXT: psubb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 -; SSE41-NEXT: pand %xmm3, %xmm5 -; SSE41-NEXT: pandn %xmm2, %xmm3 -; SSE41-NEXT: pmaddubsw %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pandn %xmm2, %xmm4 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm1 ; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -3076,14 +3053,13 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -3117,12 +3093,10 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -3136,12 +3110,10 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, 
%xmm4 -; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -3286,16 +3258,14 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE41-NEXT: psubb %xmm3, %xmm0 ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pand %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 -; SSE41-NEXT: pand %xmm3, %xmm5 -; SSE41-NEXT: pandn %xmm2, %xmm3 -; SSE41-NEXT: pmaddubsw %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pandn %xmm2, %xmm4 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm0 ; SSE41-NEXT: psllw $8, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -3310,14 +3280,13 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -3353,12 +3322,10 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -3373,12 +3340,10 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 -; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 7c9adaf..85791cd 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -1896,40 +1896,38 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwind { ; AVX1-LABEL: vec256_i8_signed_reg_reg: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5 ; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm6 ; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpminsb %xmm3, %xmm2, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3 -; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 +; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6 +; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 -; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm7 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: 
vpand %xmm7, %xmm4, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -1943,14 +1941,13 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1974,15 +1971,13 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpmullw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -1998,14 +1993,13 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, 
%ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -2087,19 +2081,17 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm8 -; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vpandn %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm7 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpandn %xmm6, %xmm8, %xmm6 ; AVX1-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 @@ -2119,14 +2111,13 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX2-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 -; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm4 -; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm2 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2150,15 +2141,13 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vpmullw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2175,14 +2164,13 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4 -; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm2 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -2247,41 +2235,39 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind { ; AVX1-LABEL: vec256_i8_signed_mem_reg: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm5 ; AVX1-NEXT: vpminsb %xmm0, %xmm1, %xmm6 ; AVX1-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpminsb %xmm3, %xmm2, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3 -; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 +; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6 +; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 -; AVX1-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm7 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm0 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -2296,14 +2282,13 
@@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX2-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4 -; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -2328,15 +2313,13 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 +; XOP-NEXT: vpmullw %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0 @@ -2353,14 +2336,13 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; @@ -2443,19 +2425,17 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 -; AVX1-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm7 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpandn %xmm5, %xmm8, 
%xmm5 ; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm7, %xmm2 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4 ; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 @@ -2474,14 +2454,13 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2506,15 +2485,13 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2 +; XOP-NEXT: vpmullw %xmm5, %xmm2, %xmm2 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm2, %xmm2 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm3, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 +; XOP-NEXT: vpmullw %xmm4, %xmm3, %xmm3 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm3, %xmm3 ; XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0 @@ -2531,14 +2508,13 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -2603,44 +2579,42 @@ define <32 x i8> 
@vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i8_signed_mem_mem: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5 -; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpminsb %xmm0, %xmm2, %xmm6 +; AVX1-NEXT: vpmaxsb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpminsb %xmm1, %xmm3, %xmm6 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8 -; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm7 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm0 ; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5 -; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm5 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i8_signed_mem_mem: @@ -2654,14 +2628,13 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2687,15 +2660,13 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 ; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 -; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 +; XOP-NEXT: vpmullw %xmm5, %xmm0, %xmm0 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] ; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 ; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6 -; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0 @@ -2713,14 +2684,13 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 -; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll index a75d42e..c058e37 100644 --- a/llvm/test/CodeGen/X86/midpoint-int.ll +++ b/llvm/test/CodeGen/X86/midpoint-int.ll @@ -658,9 +658,9 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -710,9 +710,9 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl 
%eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setbe %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -765,9 +765,9 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %ecx +; X86-NEXT: movzwl (%eax), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -817,11 +817,11 @@ define i16 @scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_reg_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -871,12 +871,12 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_mem_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 04f0a65..aa2dd00 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -889,19 +889,17 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm3, %ymm5 -; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 -; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; CHECK-SKX-NOVBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm4 +; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm3, %ymm5, %ymm3 ; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm1, %ymm1 -; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4) -; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm2, %ymm3 -; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 -; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm5) +; CHECK-SKX-NOVBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm3 +; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm2, %ymm5, %ymm2 ; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm0, %ymm0 -; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4) +; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm5) ; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm0, (%rdx) ; 
CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-SKX-NOVBMI-NEXT: vzeroupper @@ -913,20 +911,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5 -; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 -; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3 +; CHECK-SKX-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm4 +; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm5, %ymm3 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62] -; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1 -; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5 -; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 -; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2 +; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm1, %ymm3, %ymm4 +; CHECK-SKX-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm1 +; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm5, %ymm2 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 -; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0 -; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx) -; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm0, %ymm3, %ymm1 +; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, (%rdx) +; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm4, 32(%rdx) ; CHECK-SKX-VBMI-NEXT: vzeroupper ; CHECK-SKX-VBMI-NEXT: retq ; @@ -936,19 +932,17 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512-NEXT: vpand %ymm4, %ymm3, %ymm5 -; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 -; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm4 +; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm5, %ymm3 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4) -; CHECK-AVX512-NEXT: vpand %ymm4, %ymm2, %ymm3 -; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 -; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm5) +; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm3 +; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm5, %ymm2 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4) +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm5) ; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-AVX512-NEXT: vzeroupper @@ -960,20 +954,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2 ; 
CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
-; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
-; CHECK-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
+; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm4
+; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm5, %ymm3
; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
-; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
-; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
-; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
-; CHECK-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2
+; CHECK-VBMI-NEXT: vpermt2b %ymm1, %ymm3, %ymm4
+; CHECK-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm1
+; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm5, %ymm2
; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
-; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
-; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
-; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
+; CHECK-VBMI-NEXT: vpermt2b %ymm0, %ymm3, %ymm1
+; CHECK-VBMI-NEXT: vmovdqa %ymm1, (%rdx)
+; CHECK-VBMI-NEXT: vmovdqa %ymm4, 32(%rdx)
; CHECK-VBMI-NEXT: vzeroupper
; CHECK-VBMI-NEXT: retq
%d = load <64 x i8>, ptr %a
@@ -988,13 +980,12 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-NOVBMI: # %bb.0:
; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SKX-NOVBMI-NEXT: vpandq %zmm2, %zmm1, %zmm3
-; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-SKX-NOVBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm1
+; CHECK-SKX-NOVBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-NOVBMI-NEXT: vpandnq %zmm1, %zmm3, %zmm1
; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %zmm0, %zmm0
-; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & zmm3)
; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 %zmm0, (%rdx)
; CHECK-SKX-NOVBMI-NEXT: vzeroupper
; CHECK-SKX-NOVBMI-NEXT: retq
@@ -1003,13 +994,11 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
-; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-SKX-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; CHECK-SKX-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; CHECK-SKX-VBMI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
-; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
+; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm0, %zmm2, %zmm1
; CHECK-SKX-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
; CHECK-SKX-VBMI-NEXT: vzeroupper
; CHECK-SKX-VBMI-NEXT: retq
@@ -1018,13 +1007,12 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm3
-; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1
+; CHECK-AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm3, %zmm1
; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-AVX512-NEXT: vpsllw $8, %zmm0, %zmm0
-; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
+; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & zmm3)
; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
; CHECK-AVX512-NEXT: vzeroupper
; CHECK-AVX512-NEXT: retq
@@ -1033,13 +1021,11 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-VBMI: # %bb.0:
; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
-; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; CHECK-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; CHECK-VBMI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
-; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
+; CHECK-VBMI-NEXT: vpermi2b %zmm0, %zmm2, %zmm1
; CHECK-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
; CHECK-VBMI-NEXT: vzeroupper
; CHECK-VBMI-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
index 73d459b..8f97d26 100644
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -403,11 +403,11 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: xorl %eax, %eax
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: je .LBB3_1
; X86-NEXT: # %bb.2: # %bb26.preheader
; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: xorl %eax, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB3_3: # %bb26
@@ -427,7 +427,6 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind {
; X86-NEXT: jb .LBB3_3
; X86-NEXT: jmp .LBB3_4
; X86-NEXT: .LBB3_1:
-; X86-NEXT: xorl %eax, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: .LBB3_4: # %bb31
; X86-NEXT: popl %esi
diff --git a/llvm/test/CodeGen/X86/mul-constant-i16.ll b/llvm/test/CodeGen/X86/mul-constant-i16.ll
index b1aa789..a663f6a 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i16.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i16.ll
@@ -715,8 +715,8 @@ define i16 @test_mul_by_66(i16 %x) {
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $6, %eax
-; X64-NEXT: leal (%rax,%rdi,2), %eax
+; X64-NEXT: shll $6, %edi
+; X64-NEXT: leal (%rdi,%rax,2), %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 66
@@ -757,8 +757,8 @@ define i16 @test_mul_by_520(i16 %x) {
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $9, %eax
-; X64-NEXT: leal (%rax,%rdi,8), %eax
+; X64-NEXT: shll $9, %edi
+; X64-NEXT: leal (%rdi,%rax,8), %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 520
diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll
index 79889b9..4129b44 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i32.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll
@@ -1155,16 +1155,16 @@ define i32 @test_mul_by_66(i32 %x) {
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: movl %edi, %eax
-; X64-HSW-NEXT: shll $6, %eax
-; X64-HSW-NEXT: leal (%rax,%rdi,2), %eax
+; X64-HSW-NEXT: shll $6, %edi
+; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax
; X64-HSW-NEXT: retq
;
; X64-JAG-LABEL: test_mul_by_66:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: movl %edi, %eax
-; X64-JAG-NEXT: shll $6, %eax
-; X64-JAG-NEXT: leal (%rax,%rdi,2), %eax
+; X64-JAG-NEXT: shll $6, %edi
+; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax
; X64-JAG-NEXT: retq
;
; X86-NOOPT-LABEL: test_mul_by_66:
@@ -1241,16 +1241,16 @@ define i32 @test_mul_by_520(i32 %x) {
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: movl %edi, %eax
-; X64-HSW-NEXT: shll $9, %eax
-; X64-HSW-NEXT: leal (%rax,%rdi,8), %eax
+; X64-HSW-NEXT: shll $9, %edi
+; X64-HSW-NEXT: leal (%rdi,%rax,8), %eax
; X64-HSW-NEXT: retq
;
; X64-JAG-LABEL: test_mul_by_520:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: movl %edi, %eax
-; X64-JAG-NEXT: shll $9, %eax
-; X64-JAG-NEXT: leal (%rax,%rdi,8), %eax
+; X64-JAG-NEXT: shll $9, %edi
+; X64-JAG-NEXT: leal (%rdi,%rax,8), %eax
; X64-JAG-NEXT: retq
;
; X86-NOOPT-LABEL: test_mul_by_520:
diff --git a/llvm/test/CodeGen/X86/mul-constant-i8.ll b/llvm/test/CodeGen/X86/mul-constant-i8.ll
index a4fa1ee..b488653 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i8.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i8.ll
@@ -425,8 +425,8 @@ define i8 @test_mul_by_66(i8 %x) {
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $6, %eax
-; X64-NEXT: leal (%rax,%rdi,2), %eax
+; X64-NEXT: shll $6, %edi
+; X64-NEXT: leal (%rdi,%rax,2), %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%m = mul i8 %x, 66
diff --git a/llvm/test/CodeGen/X86/narrow-add-i64.ll b/llvm/test/CodeGen/X86/narrow-add-i64.ll
new file mode 100644
index 0000000..a7a54fd
--- /dev/null
+++ b/llvm/test/CodeGen/X86/narrow-add-i64.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
+
+define i64 @test_add_i64_i16_const(i16 %a) nounwind {
+; X86-LABEL: test_add_i64_i16_const:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl $42, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: test_add_i64_i16_const:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: addq $42, %rax
+; X64-NEXT: retq
+ %zext_a = zext i16 %a to i64
+ %sum = add nuw nsw i64 %zext_a, 42
+ ret i64 %sum
+}
+
+; TODO: First 48 bits are all zeros, so we can safely truncate to 32 bit addition
+define i64 @test_add_i64_i16_zext(i16 %a, i16 %b) nounwind {
+; X86-LABEL: test_add_i64_i16_zext:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: test_add_i64_i16_zext:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %ecx
+; X64-NEXT: movzwl %si, %eax
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: retq
+ %zext_a = zext i16 %a to i64
+ %zext_b = zext i16 %b to i64
+ %sum = add nuw nsw i64 %zext_a, %zext_b
+ ret i64 %sum
+}
+
+; Negative: Set the 32nd bit of a to force 64 bit addition; we do not truncate to 32 bit addition in this case
+define i64 @negative_test_add_i64_i16(i16 %a) nounwind {
+; X86-LABEL: negative_test_add_i64_i16:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl $42, %eax
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: negative_test_add_i64_i16:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %ecx
+; X64-NEXT: movabsq $4294967338, %rax # imm = 0x10000002A
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: retq
+ %zext_a = zext i16 %a to i64
+ %or_a = or i64 %zext_a, 4294967296
+ %sum = add nuw nsw i64 %or_a, 42
+ ret i64 %sum
+}
+
+; Negative: We don't truncate to 32 bit addition in case of sign extension
+define i64 @negative_test_add_i64_i16_sext(i16 %a, i16 %b) nounwind {
+; X86-LABEL: negative_test_add_i64_i16_sext:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: negative_test_add_i64_i16_sext:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: movswq %di, %rcx
+; X64-NEXT: movswq %si, %rax
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: retq
+ %sext_a = sext i16 %a to i64
+ %sext_b = sext i16 %b to i64
+ %sum = add nuw nsw i64 %sext_a, %sext_b
+ ret i64 %sum
+}
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 4b0f75d..ac45541 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -679,39 +679,39 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
; SSE2-NEXT: packuswb %xmm4, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pandn %xmm3, %xmm6
-; SSE2-NEXT: por %xmm5, %xmm6
-; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
-; SSE2-NEXT: packuswb %xmm5, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSE2-NEXT: movq %xmm4, (%rsi)
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm4
+; SSE2-NEXT: movq %xmm4, (%rdx)
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535]
-; SSE2-NEXT: pand %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm0, %xmm4
; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movq %xmm4, (%rsi)
-; SSE2-NEXT: movq %xmm5, (%rdx)
; SSE2-NEXT: movq %xmm0, (%rcx)
; SSE2-NEXT: retq
;
@@ -724,16 +724,16 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; SSE42-NEXT: por %xmm2, %xmm3
+; SSE42-NEXT: movq %xmm3, (%rsi)
; SSE42-NEXT: movdqa %xmm1, %xmm2
; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,3,6,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: movdqa %xmm0, %xmm4
-; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,4,7,10,13],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u]
-; SSE42-NEXT: por %xmm2, %xmm4
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; SSE42-NEXT: por %xmm2, %xmm3
+; SSE42-NEXT: movq %xmm3, (%rdx)
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSE42-NEXT: por %xmm1, %xmm0
-; SSE42-NEXT: movq %xmm3, (%rsi)
-; SSE42-NEXT: movq %xmm4, (%rdx)
; SSE42-NEXT: movq %xmm0, (%rcx)
; SSE42-NEXT: retq
;
@@ -744,14 +744,14 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vmovq %xmm2, (%rsi)
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vmovq %xmm2, (%rdx)
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm2, (%rsi)
-; AVX1-NEXT: vmovq %xmm3, (%rdx)
; AVX1-NEXT: vmovq %xmm0, (%rcx)
; AVX1-NEXT: retq
;
@@ -762,14 +762,14 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
; AVX2-NEXT: vmovq %xmm0, (%rcx)
; AVX2-NEXT: retq
;
@@ -778,10 +778,10 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; XOP-NEXT: vmovdqu (%rdi), %xmm1
; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[0,3,6,9,12,15],xmm0[2,5],xmm1[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u]
; XOP-NEXT: vmovq %xmm2, (%rsi)
-; XOP-NEXT: vmovq %xmm3, (%rdx)
+; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vmovq %xmm2, (%rdx)
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u]
; XOP-NEXT: vmovq %xmm0, (%rcx)
; XOP-NEXT: retq
%wide.vec = load <24 x i8>, ptr %p, align 4
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 81390e5..9f08658 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -13,9 +13,11 @@
; CHECK-LABEL: Pass Arguments:
; CHECK-NEXT: Target Library Information
+; CHECK-NEXT: Runtime Library Function Analysis
; CHECK-NEXT: Target Pass Configuration
; CHECK-NEXT: Machine Module Information
; CHECK-NEXT: Target Transform Information
+; CHECK-NEXT: Library Function Lowering Analysis
; CHECK-NEXT: Assumption Cache Tracker
; CHECK-NEXT: Type-Based Alias Analysis
; CHECK-NEXT: Scoped NoAlias Alias Analysis
@@ -208,8 +210,6 @@
; CHECK-NEXT: X86 Fixup Inst Tuning
; CHECK-NEXT: X86 Fixup Vector Constants
; CHECK-NEXT: Compressing EVEX instrs when possible
-; CHECK-NEXT: X86 Discriminate Memory Operands
-; CHECK-NEXT: X86 Insert Cache Prefetches
; CHECK-NEXT: X86 insert wait instruction
; CHECK-NEXT: Contiguously Lay Out Funclets
; CHECK-NEXT: Remove Loads Into Fake Uses
diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll
index 283c00e..b6af7e1 100644
--- a/llvm/test/CodeGen/X86/optimize-max-0.ll
+++ b/llvm/test/CodeGen/X86/optimize-max-0.ll
@@ -16,65 +16,65 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: subl $28, %esp
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: imull %ebp, %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: imull %esi, %eax
; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %ecx, (%esp) ## 4-byte Spill
+; CHECK-NEXT: movl %eax, (%esp) ## 4-byte Spill
; CHECK-NEXT: je LBB0_19
; CHECK-NEXT: ## %bb.1: ## %bb10.preheader
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: sarl $31, %eax
-; CHECK-NEXT: shrl $30, %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: sarl $2, %eax
-; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: movl %eax, %ebp
+; CHECK-NEXT: sarl $31, %ebp
+; CHECK-NEXT: shrl $30, %ebp
+; CHECK-NEXT: addl %eax, %ebp
+; CHECK-NEXT: sarl $2, %ebp
+; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: jle LBB0_12
; CHECK-NEXT: ## %bb.2: ## %bb.nph9
-; CHECK-NEXT: testl %ebp, %ebp
+; CHECK-NEXT: testl %esi, %esi
; CHECK-NEXT: jle LBB0_12
; CHECK-NEXT: ## %bb.3: ## %bb.nph9.split
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: incl %eax
; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB0_4: ## %bb6
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movzbl (%eax,%esi,2), %ebx
-; CHECK-NEXT: movb %bl, (%edx,%esi)
-; CHECK-NEXT: incl %esi
-; CHECK-NEXT: cmpl %ebp, %esi
+; CHECK-NEXT: movzbl (%eax,%edi,2), %ebx
+; CHECK-NEXT: movb %bl, (%edx,%edi)
+; CHECK-NEXT: incl %edi
+; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: jl LBB0_4
; CHECK-NEXT: ## %bb.5: ## %bb9
; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1
; CHECK-NEXT: incl %ecx
; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: addl %ebp, %edx
-; CHECK-NEXT: cmpl %edi, %ecx
+; CHECK-NEXT: addl %esi, %edx
+; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: je LBB0_12
; CHECK-NEXT: ## %bb.6: ## %bb7.preheader
; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: jmp LBB0_4
; CHECK-NEXT: LBB0_12: ## %bb18.loopexit
+; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; CHECK-NEXT: movl (%esp), %eax ## 4-byte Reload
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %ebp, %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT: cmpl $1, %edi
+; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: jle LBB0_13
; CHECK-NEXT: ## %bb.7: ## %bb.nph5
-; CHECK-NEXT: cmpl $2, %ebp
+; CHECK-NEXT: cmpl $2, %esi
; CHECK-NEXT: jl LBB0_13
; CHECK-NEXT: ## %bb.8: ## %bb.nph5.split
-; CHECK-NEXT: movl %ebp, %edx
-; CHECK-NEXT: shrl $31, %edx
-; CHECK-NEXT: addl %ebp, %edx
-; CHECK-NEXT: sarl %edx
+; CHECK-NEXT: movl %esi, %ebp
+; CHECK-NEXT: shrl $31, %ebp
+; CHECK-NEXT: addl %esi, %ebp
+; CHECK-NEXT: sarl %ebp
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl $31, %ecx
@@ -84,102 +84,103 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: addl $2, %esi
-; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload
-; CHECK-NEXT: addl %esi, %ecx
-; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: addl $2, %edx
+; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload
+; CHECK-NEXT: addl %edx, %ecx
; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB0_9: ## %bb13
; CHECK-NEXT: ## =>This Loop Header: Depth=1
; CHECK-NEXT: ## Child Loop BB0_10 Depth 2
; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT: addl %esi, %edi
+; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT: addl %edx, %edi
; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edi
; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB0_10: ## %bb14
; CHECK-NEXT: ## Parent Loop BB0_9 Depth=1
; CHECK-NEXT: ## => This Inner Loop Header: Depth=2
-; CHECK-NEXT: movzbl -2(%edi,%esi,4), %ebx
-; CHECK-NEXT: movb %bl, (%ecx,%esi)
-; CHECK-NEXT: movzbl (%edi,%esi,4), %ebx
-; CHECK-NEXT: movb %bl, (%eax,%esi)
-; CHECK-NEXT: incl %esi
-; CHECK-NEXT: cmpl %edx, %esi
+; CHECK-NEXT: movzbl -2(%edi,%ebx,4), %edx
+; CHECK-NEXT: movb %dl, (%ecx,%ebx)
+; CHECK-NEXT: movzbl (%edi,%ebx,4), %edx
+; CHECK-NEXT: movb %dl, (%eax,%ebx)
+; CHECK-NEXT: incl %ebx
+; CHECK-NEXT: cmpl %ebp, %ebx
; CHECK-NEXT: jl LBB0_10
; CHECK-NEXT: ## %bb.11: ## %bb17
; CHECK-NEXT: ## in Loop: Header=BB0_9 Depth=1
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
; CHECK-NEXT: incl %edi
-; CHECK-NEXT: addl %edx, %eax
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; CHECK-NEXT: addl $2, %esi
-; CHECK-NEXT: addl %edx, %ecx
+; CHECK-NEXT: addl %ebp, %eax
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT: addl $2, %edx
+; CHECK-NEXT: addl %ebp, %ecx
; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; CHECK-NEXT: jl LBB0_9
; CHECK-NEXT: LBB0_13: ## %bb20
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: cmpl $1, %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: cmpl $1, %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
; CHECK-NEXT: je LBB0_19
; CHECK-NEXT: ## %bb.14: ## %bb20
-; CHECK-NEXT: cmpl $3, %eax
+; CHECK-NEXT: cmpl $3, %ecx
; CHECK-NEXT: jne LBB0_24
; CHECK-NEXT: ## %bb.15: ## %bb22
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; CHECK-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; CHECK-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: jle LBB0_18
; CHECK-NEXT: ## %bb.16: ## %bb.nph
-; CHECK-NEXT: leal 15(%edi), %eax
+; CHECK-NEXT: leal 15(%edx), %eax
; CHECK-NEXT: andl $-16, %eax
; CHECK-NEXT: imull {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: addl %ebx, %ebx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload
-; CHECK-NEXT: addl %esi, %ecx
-; CHECK-NEXT: addl %ecx, %ebx
-; CHECK-NEXT: addl %eax, %edx
-; CHECK-NEXT: leal 15(%ebp), %eax
+; CHECK-NEXT: addl %ebp, %ebp
+; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT: addl %edi, %ecx
+; CHECK-NEXT: addl %ecx, %ebp
+; CHECK-NEXT: addl %eax, %ebx
+; CHECK-NEXT: leal 15(%esi), %eax
; CHECK-NEXT: andl $-16, %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB0_17: ## %bb23
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subl $4, %esp
-; CHECK-NEXT: pushl %ebp
-; CHECK-NEXT: pushl %edx
+; CHECK-NEXT: pushl %esi
; CHECK-NEXT: pushl %ebx
-; CHECK-NEXT: movl %ebx, %esi
+; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: movl %ebp, %edi
+; CHECK-NEXT: movl %ebx, %ebp
; CHECK-NEXT: movl %edx, %ebx
; CHECK-NEXT: calll _memcpy
; CHECK-NEXT: movl %ebx, %edx
-; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %ebp, %ebx
+; CHECK-NEXT: movl %edi, %ebp
; CHECK-NEXT: addl $16, %esp
-; CHECK-NEXT: addl %ebp, %ebx
-; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; CHECK-NEXT: decl %edi
+; CHECK-NEXT: addl %esi, %ebp
+; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; CHECK-NEXT: decl %edx
; CHECK-NEXT: jne LBB0_17
; CHECK-NEXT: LBB0_18: ## %bb26
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload
-; CHECK-NEXT: addl %edx, %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; CHECK-NEXT: addl %ecx, %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: addl %esi, %edx
; CHECK-NEXT: jmp LBB0_23
; CHECK-NEXT: LBB0_19: ## %bb29
-; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: jle LBB0_22
; CHECK-NEXT: ## %bb.20: ## %bb.nph11
-; CHECK-NEXT: movl %edi, %esi
-; CHECK-NEXT: leal 15(%ebp), %eax
+; CHECK-NEXT: leal 15(%esi), %eax
; CHECK-NEXT: andl $-16, %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -187,30 +188,32 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
; CHECK-NEXT: LBB0_21: ## %bb30
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subl $4, %esp
-; CHECK-NEXT: pushl %ebp
-; CHECK-NEXT: pushl %edx
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: movl %ebx, %ebp
; CHECK-NEXT: movl %edx, %ebx
; CHECK-NEXT: calll _memcpy
; CHECK-NEXT: movl %ebx, %edx
+; CHECK-NEXT: movl %ebp, %ebx
; CHECK-NEXT: addl $16, %esp
-; CHECK-NEXT: addl %ebp, %edi
-; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; CHECK-NEXT: decl %esi
+; CHECK-NEXT: addl %esi, %edi
+; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; CHECK-NEXT: decl %edx
; CHECK-NEXT: jne LBB0_21
; CHECK-NEXT: LBB0_22: ## %bb33
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload
-; CHECK-NEXT: addl %edx, %ecx
+; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: addl %ecx, %edx
; CHECK-NEXT: LBB0_23: ## %bb33
-; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: shrl $31, %eax
-; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: sarl %eax
; CHECK-NEXT: subl $4, %esp
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: pushl $128
-; CHECK-NEXT: pushl %ecx
+; CHECK-NEXT: pushl %edx
; CHECK-NEXT: calll _memset
; CHECK-NEXT: addl $44, %esp
; CHECK-NEXT: LBB0_25: ## %return
@@ -523,38 +526,38 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB1_9: ## %bb13
; CHECK-NEXT: ## =>This Loop Header: Depth=1
; CHECK-NEXT: ## Child Loop BB1_10 Depth 2
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT: andl $1, %ebx
; CHECK-NEXT: movl %edx, (%esp) ## 4-byte Spill
-; CHECK-NEXT: addl %edx, %ebx
-; CHECK-NEXT: imull {{[0-9]+}}(%esp), %ebx
-; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; CHECK-NEXT: andl $1, %edx
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT: addl %esi, %edx
+; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB1_10: ## %bb14
; CHECK-NEXT: ## Parent Loop BB1_9 Depth=1
; CHECK-NEXT: ## => This Inner Loop Header: Depth=2
-; CHECK-NEXT: movzbl -2(%ebx,%esi,4), %edx
-; CHECK-NEXT: movb %dl, (%eax,%esi)
-; CHECK-NEXT: movzbl (%ebx,%esi,4), %edx
-; CHECK-NEXT: movb %dl, (%ecx,%esi)
+; CHECK-NEXT: movzbl -2(%edx,%esi,4), %ebx
+; CHECK-NEXT: movb %bl, (%eax,%esi)
+; CHECK-NEXT: movzbl (%edx,%esi,4), %ebx
+; CHECK-NEXT: movb %bl, (%ecx,%esi)
; CHECK-NEXT: incl %esi
; CHECK-NEXT: cmpl %ebp, %esi
; CHECK-NEXT: jb LBB1_10
; CHECK-NEXT: ## %bb.11: ## %bb17
; CHECK-NEXT: ## in Loop: Header=BB1_9 Depth=1
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; CHECK-NEXT: incl %ebx
-; CHECK-NEXT: addl %ebp, %ecx
; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload
-; CHECK-NEXT: addl $2, %edx
+; CHECK-NEXT: incl %edx
+; CHECK-NEXT: addl %ebp, %ecx
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; CHECK-NEXT: addl $2, %esi
; CHECK-NEXT: addl %ebp, %eax
-; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; CHECK-NEXT: jb LBB1_9
; CHECK-NEXT: LBB1_13: ## %bb20
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll
index 420f5ba..31a7f11 100644
--- a/llvm/test/CodeGen/X86/parity.ll
+++ b/llvm/test/CodeGen/X86/parity.ll
@@ -219,12 +219,12 @@ define i64 @parity_64(i64 %x) {
;
; X64-NOPOPCNT-LABEL: parity_64:
; X64-NOPOPCNT: # %bb.0:
-; X64-NOPOPCNT-NEXT: movq %rdi, %rax
-; X64-NOPOPCNT-NEXT: shrq $32, %rax
-; X64-NOPOPCNT-NEXT: xorl %edi, %eax
-; X64-NOPOPCNT-NEXT: movl %eax, %ecx
+; X64-NOPOPCNT-NEXT: movl %edi, %eax
+; X64-NOPOPCNT-NEXT: shrq $32, %rdi
+; X64-NOPOPCNT-NEXT: xorl %eax, %edi
+; X64-NOPOPCNT-NEXT: movl %edi, %ecx
; X64-NOPOPCNT-NEXT: shrl $16, %ecx
-; X64-NOPOPCNT-NEXT: xorl %eax, %ecx
+; X64-NOPOPCNT-NEXT: xorl %edi, %ecx
; X64-NOPOPCNT-NEXT: xorl %eax, %eax
; X64-NOPOPCNT-NEXT: xorb %ch, %cl
; X64-NOPOPCNT-NEXT: setnp %al
@@ -264,12 +264,12 @@ define i32 @parity_64_trunc(i64 %x) {
;
; X64-NOPOPCNT-LABEL: parity_64_trunc:
; X64-NOPOPCNT: # %bb.0:
-; X64-NOPOPCNT-NEXT: movq %rdi, %rax
-; X64-NOPOPCNT-NEXT: shrq $32, %rax
-; X64-NOPOPCNT-NEXT: xorl %edi, %eax
-; X64-NOPOPCNT-NEXT: movl %eax, %ecx
+; X64-NOPOPCNT-NEXT: movl %edi, %eax
+; X64-NOPOPCNT-NEXT: shrq $32, %rdi
+; X64-NOPOPCNT-NEXT: xorl %eax, %edi
+; X64-NOPOPCNT-NEXT: movl %edi, %ecx
; X64-NOPOPCNT-NEXT: shrl $16, %ecx
-; X64-NOPOPCNT-NEXT: xorl %eax, %ecx
+; X64-NOPOPCNT-NEXT: xorl %edi, %ecx
; X64-NOPOPCNT-NEXT: xorl %eax, %eax
; X64-NOPOPCNT-NEXT: xorb %ch, %cl
; X64-NOPOPCNT-NEXT: setnp %al
@@ -628,12 +628,12 @@ define i64 @parity_64_shift(i64 %0) {
;
; X64-NOPOPCNT-LABEL: parity_64_shift:
; X64-NOPOPCNT: # %bb.0:
-; X64-NOPOPCNT-NEXT: movq %rdi, %rax
-; X64-NOPOPCNT-NEXT: shrq $32, %rax
-; X64-NOPOPCNT-NEXT: xorl %edi, %eax
-; X64-NOPOPCNT-NEXT: movl %eax, %ecx
+; X64-NOPOPCNT-NEXT: movl %edi, %eax
+; X64-NOPOPCNT-NEXT: shrq $32, %rdi
+; X64-NOPOPCNT-NEXT: xorl %eax, %edi
+; X64-NOPOPCNT-NEXT: movl %edi, %ecx
; X64-NOPOPCNT-NEXT: shrl $16, %ecx
-; X64-NOPOPCNT-NEXT: xorl %eax, %ecx
+; X64-NOPOPCNT-NEXT: xorl %edi, %ecx
; X64-NOPOPCNT-NEXT: xorl %eax, %eax
; X64-NOPOPCNT-NEXT: xorb %ch, %cl
; X64-NOPOPCNT-NEXT: setnp %al
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 00731fe..a1808e4 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -10,7 +10,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,u,117,u,117,u,117,u,117,u,117,u,117,u,117,u]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
@@ -25,7 +25,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -160,16 +160,14 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
;
; SSE41-LABEL: mul_v16i8:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pand %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm4
-; SSE41-NEXT: pand %xmm2, %xmm4
-; SSE41-NEXT: pandn %xmm1, %xmm2
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pmullw %xmm1, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm3, %xmm2
+; SSE41-NEXT: pandn %xmm1, %xmm3
+; SSE41-NEXT: pmaddubsw %xmm3, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v16i8:
@@ -380,7 +378,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [117,117,117,117,117,117,117,117]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [117,u,117,u,117,u,117,u,117,u,117,u,117,u,117,u]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm2
@@ -400,28 +398,27 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
;
; SSE41-LABEL: mul_v32i8c:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm4, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; SSE41-NEXT: pmaddubsw %xmm5, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm3
-; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pmullw %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm2
; SSE41-NEXT: pmaddubsw %xmm5, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm3, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8c:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -430,7 +427,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -584,49 +581,44 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
;
; SSE41-LABEL: mul_v32i8:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pandn %xmm2, %xmm5
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm6
-; SSE41-NEXT: pand %xmm4, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pmullw %xmm2, %xmm4
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm5, %xmm6
+; SSE41-NEXT: pandn %xmm2, %xmm6
+; SSE41-NEXT: pmaddubsw %xmm6, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm5
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pandn %xmm3, %xmm4
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm1
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pmullw %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: pandn %xmm3, %xmm5
+; SSE41-NEXT: pmaddubsw %xmm5, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm5, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpandn %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v32i8:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3
-; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpandn %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v32i8:
@@ -737,7 +729,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,u,117,u,117,u,117,u,117,u,117,u,117,u,117,u]
; SSE2-NEXT: pmullw %xmm4, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm5, %xmm6
@@ -773,9 +765,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
;
; SSE41-LABEL: mul_v64i8c:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm6
+; SSE41-NEXT: pmullw %xmm4, %xmm6
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm5, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
@@ -783,36 +775,35 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm6
+; SSE41-NEXT: pmullw %xmm4, %xmm6
; SSE41-NEXT: pand %xmm5, %xmm6
; SSE41-NEXT: pmaddubsw %xmm7, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
; SSE41-NEXT: por %xmm6, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm6
+; SSE41-NEXT: pmullw %xmm4, %xmm6
; SSE41-NEXT: pand %xmm5, %xmm6
; SSE41-NEXT: pmaddubsw %xmm7, %xmm2
; SSE41-NEXT: psllw $8, %xmm2
; SSE41-NEXT: por %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm6
-; SSE41-NEXT: pand %xmm5, %xmm6
+; SSE41-NEXT: pmullw %xmm3, %xmm4
+; SSE41-NEXT: pand %xmm5, %xmm4
; SSE41-NEXT: pmaddubsw %xmm7, %xmm3
; SSE41-NEXT: psllw $8, %xmm3
-; SSE41-NEXT: por %xmm6, %xmm3
+; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v64i8c:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
-; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -822,9 +813,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; AVX512F-LABEL: mul_v64i8c:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
-; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm0
@@ -837,7 +828,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
;
; AVX512BW-LABEL: mul_v64i8c:
; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
@@ -899,59 +890,52 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
;
; SSE41-LABEL: mul_v64i8:
; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: movdqa %xmm0, %xmm9
+; SSE41-NEXT: pmullw %xmm4, %xmm9
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm8, %xmm9
-; SSE41-NEXT: pandn %xmm4, %xmm9
-; SSE41-NEXT: pand %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm10
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm10
-; SSE41-NEXT: pand %xmm8, %xmm10
-; SSE41-NEXT: pmaddubsw %xmm9, %xmm0
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: por %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pandn %xmm5, %xmm4
-; SSE41-NEXT: pand %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm1, %xmm9
-; SSE41-NEXT: pmaddubsw %xmm5, %xmm9
; SSE41-NEXT: pand %xmm8, %xmm9
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm8, %xmm10
+; SSE41-NEXT: pandn %xmm4, %xmm10
+; SSE41-NEXT: pmaddubsw %xmm10, %xmm0
+; SSE41-NEXT: psllw $8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pmullw %xmm5, %xmm4
+; SSE41-NEXT: pand %xmm8, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm9
+; SSE41-NEXT: pandn %xmm5, %xmm9
+; SSE41-NEXT: pmaddubsw %xmm9, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm9, %xmm1
-; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pandn %xmm6, %xmm4
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm6, %xmm5
-; SSE41-NEXT: pand %xmm8, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm2
+; SSE41-NEXT: por %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pmullw %xmm6, %xmm4
+; SSE41-NEXT: pand %xmm8, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm5
+; SSE41-NEXT: pandn %xmm6, %xmm5
+; SSE41-NEXT: pmaddubsw %xmm5, %xmm2
; SSE41-NEXT: psllw $8, %xmm2
-; SSE41-NEXT: por %xmm5, %xmm2
-; SSE41-NEXT: movdqa %xmm7, %xmm4
+; SSE41-NEXT: por %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pmullw %xmm7, %xmm4
; SSE41-NEXT: pand %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm8, %xmm5
; SSE41-NEXT: pandn %xmm7, %xmm8
; SSE41-NEXT: pmaddubsw %xmm8, %xmm3
; SSE41-NEXT: psllw $8, %xmm3
-; SSE41-NEXT: por %xmm5, %xmm3
+; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v64i8:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm5
-; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
-; AVX2-NEXT: vpand %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm4
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpandn %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2
-; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpandn %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
@@ -959,33 +943,30 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
;
; AVX512F-LABEL: mul_v64i8:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm6
-; AVX512F-NEXT: vpmaddubsw %ymm6, %ymm0, %ymm6
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512F-NEXT: vpandn %ymm1, %ymm5, %ymm1
; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm1
-; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: vpandn %ymm2, %ymm5, %ymm1
+; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm2)
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm5)
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v64i8:
; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpandnq %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpandnq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & zmm3)
; AVX512BW-NEXT: retq
entry:
%A = mul <64 x i8> %i, %j
diff --git a/llvm/test/CodeGen/X86/pr114360.ll b/llvm/test/CodeGen/X86/pr114360.ll
index cf51085..41cf06a 100644
--- a/llvm/test/CodeGen/X86/pr114360.ll
+++ b/llvm/test/CodeGen/X86/pr114360.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; REQUIRES: asserts
; RUN: llc < %s -mtriple=x86_64-- -debug-counter=dagcombine=0 | FileCheck %s
; BUG: shrinkAndImmediate folds away the AND after the ZEXT has already been folded away to SUBREG_TO_REG losing implicit zext.
diff --git a/llvm/test/CodeGen/X86/pr165755.ll b/llvm/test/CodeGen/X86/pr165755.ll
new file mode 100644
index 0000000..3ab484f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr165755.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=X64
+
+define i32 @PR165755(ptr %p0) {
+; X86-LABEL: PR165755:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %eax
+; X86-NEXT: movb $0, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: PR165755:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: movb $0, (%rdi)
+; X64-NEXT: retq
+ %ld64 = load i64, ptr %p0, align 8
+ store i8 0, ptr %p0, align 1
+ %ld32 = load i32, ptr %p0, align 8
+ %mask = and i32 %ld32, 32
+ %zext = zext i32 %mask to i64
+ %srl = lshr i64 %ld64, %zext
+ %res = trunc i64 %srl to i32
+ ret i32 %res
+}
diff --git a/llvm/test/CodeGen/X86/pr166058.ll b/llvm/test/CodeGen/X86/pr166058.ll
new file mode 100644
index 0000000..42d68fd
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr166058.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+
+@out = global i32 0, align 4
+define void @bar() {
+; CHECK-LABEL: bar:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq out@GOTPCREL(%rip), %rax
+; CHECK-NEXT: #APP
+; CHECK-NEXT: addl $-1, (%rax)
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: retq
+ call void asm "addl $1,$0", "=*m,L,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) @out, i32 -1)
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll
new file mode 100644
index 0000000..162a0c9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr166534.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE4
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
+
+define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
+; SSE2-LABEL: pr166534:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu (%rsi), %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %esi
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: orq %rax, (%rdx)
+; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
+; SSE2-NEXT: jne .LBB0_2
+; SSE2-NEXT: # %bb.1: # %if.then
+; SSE2-NEXT: orq %rax, (%rcx)
+; SSE2-NEXT: .LBB0_2: # %if.end
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: pr166534:
+; SSE4: # %bb.0: # %entry
+; SSE4-NEXT: movdqu (%rdi), %xmm0
+; SSE4-NEXT: movdqu (%rsi), %xmm1
+; SSE4-NEXT: pxor %xmm0, %xmm1
+; SSE4-NEXT: xorl %eax, %eax
+; SSE4-NEXT: ptest %xmm1, %xmm1
+; SSE4-NEXT: sete %al
+; SSE4-NEXT: orq %rax, (%rdx)
+; SSE4-NEXT: ptest %xmm1, %xmm1
+; SSE4-NEXT: jne .LBB0_2
+; SSE4-NEXT: # %bb.1: # %if.then
+; SSE4-NEXT: orq %rax, (%rcx)
+; SSE4-NEXT: .LBB0_2: # %if.end
+; SSE4-NEXT: retq
+;
+; AVX2-LABEL: pr166534:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; AVX2-NEXT: vpxor (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: orq %rax, (%rdx)
+; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: jne .LBB0_2
+; AVX2-NEXT: # %bb.1: # %if.then
+; AVX2-NEXT: orq %rax, (%rcx)
+; AVX2-NEXT: .LBB0_2: # %if.end
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: pr166534:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: orq %rax, (%rdx)
+; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: jne .LBB0_2
+; AVX512-NEXT: # %bb.1: # %if.then
+; AVX512-NEXT: orq %rax, (%rcx)
+; AVX512-NEXT: .LBB0_2: # %if.end
+; AVX512-NEXT: retq
+entry:
+ %a = load i128, ptr %pa, align 8
+ %b = load i128, ptr %pb, align 8
+ %cmp = icmp eq i128 %a, %b
+ %conv1 = zext i1 %cmp to i128
+ %c = load i128, ptr %pc, align 8
+ %or = or i128 %c, %conv1
+ store i128 %or, ptr %pc, align 8
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %d = load i128, ptr %pd, align 8
+ %or7 = or i128 %d, %conv1
+ store i128 %or7, ptr %pd, align 8
+ br label %if.end
+
+if.end:
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll
new file mode 100644
index 0000000..ffdb68c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr166744.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=POSTRA
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=haswell | FileCheck %s --check-prefixes=NOPOSTRA
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=NOPOSTRA
+
+; Ensure reloads are after narrowed i512 -> i32 store
+define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
+; POSTRA-LABEL: PR166744:
+; POSTRA: # %bb.0:
+; POSTRA-NEXT: movl $1029, %eax # imm = 0x405
+; POSTRA-NEXT: shlxl %esi, %edx, %edx
+; POSTRA-NEXT: bextrl %eax, %esi, %eax
+; POSTRA-NEXT: movl (%rdi,%rax,4), %ecx
+; POSTRA-NEXT: btrl %esi, %ecx
+; POSTRA-NEXT: orl %ecx, %edx
+; POSTRA-NEXT: movl %edx, (%rdi,%rax,4)
+; POSTRA-NEXT: movq 16(%rdi), %rax
+; POSTRA-NEXT: movq (%rdi), %rcx
+; POSTRA-NEXT: movq 24(%rdi), %rdx
+; POSTRA-NEXT: movq 8(%rdi), %rsi
+; POSTRA-NEXT: orq 56(%rdi), %rdx
+; POSTRA-NEXT: orq 40(%rdi), %rsi
+; POSTRA-NEXT: orq 48(%rdi), %rax
+; POSTRA-NEXT: orq 32(%rdi), %rcx
+; POSTRA-NEXT: orq %rdx, %rsi
+; POSTRA-NEXT: orq %rax, %rcx
+; POSTRA-NEXT: orq %rsi, %rcx
+; POSTRA-NEXT: setne %al
+; POSTRA-NEXT: retq
+;
+; NOPOSTRA-LABEL: PR166744:
+; NOPOSTRA: # %bb.0:
+; NOPOSTRA-NEXT: movl %esi, %eax
+; NOPOSTRA-NEXT: shrl $3, %esi
+; NOPOSTRA-NEXT: andl $60, %esi
+; NOPOSTRA-NEXT: movl (%rdi,%rsi), %ecx
+; NOPOSTRA-NEXT: btrl %eax, %ecx
+; NOPOSTRA-NEXT: shlxl %eax, %edx, %eax
+; NOPOSTRA-NEXT: orl %ecx, %eax
+; NOPOSTRA-NEXT: movl %eax, (%rdi,%rsi)
+; NOPOSTRA-NEXT: movq 16(%rdi), %rax
+; NOPOSTRA-NEXT: movq (%rdi), %rcx
+; NOPOSTRA-NEXT: movq 8(%rdi), %rdx
+; NOPOSTRA-NEXT: movq 24(%rdi), %rsi
+; NOPOSTRA-NEXT: orq 56(%rdi), %rsi
+; NOPOSTRA-NEXT: orq 40(%rdi), %rdx
+; NOPOSTRA-NEXT: orq 48(%rdi), %rax
+; NOPOSTRA-NEXT: orq 32(%rdi), %rcx
+; NOPOSTRA-NEXT: orq %rsi, %rdx
+; NOPOSTRA-NEXT: orq %rax, %rcx
+; NOPOSTRA-NEXT: orq %rdx, %rcx
+; NOPOSTRA-NEXT: setne %al
+; NOPOSTRA-NEXT: retq
+ %rem = and i64 %idx, 511
+ %sh_prom = zext nneg i64 %rem to i512
+ %shl = shl nuw i512 1, %sh_prom
+ %not = xor i512 %shl, -1
+ %load = load i512, ptr %v, align 8
+ %and = and i512 %load, %not
+ %conv2 = zext i1 %b to i512
+ %shl4 = shl nuw i512 %conv2, %sh_prom
+ %or = or i512 %and, %shl4
+ store i512 %or, ptr %v, align 8
+ %cmp = icmp ne i512 %or, 0
+ ret i1 %cmp
+}
diff --git a/llvm/test/CodeGen/X86/pr167793.ll b/llvm/test/CodeGen/X86/pr167793.ll
new file mode 100644
index 0000000..9b394bf
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr167793.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s
+
+define <4 x double> @PR167793(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: PR167793:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vhaddpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; CHECK-NEXT: retq
+ %i5 = shufflevector <4 x double> %a0, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %i6 = fadd <4 x double> %a0, %i5
+ %i8 = shufflevector <4 x double> %a1, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %i9 = fadd <4 x double> %a1, %i8
+ %i10 = shufflevector <4 x double> %i6, <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+ %i11 = shufflevector <4 x double> %i6, <4 x double> poison, <2 x i32> <i32 poison, i32 1>
+ %i12 = fadd <2 x double> %i10, %i11
+ %i13 = shufflevector <4 x double> %i9, <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+ %i14 = shufflevector <4 x double> %i9, <4 x double> poison, <2 x i32> <i32 poison, i32 1>
+ %i15 = fadd <2 x double> %i13, %i14
+ %i16 = shufflevector <4 x double> zeroinitializer, <4 x double> poison, <2 x i32> <i32 poison, i32 1>
+ %i18 = shufflevector <2 x double> %i15, <2 x double> %i16, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+ %i19 = shufflevector <2 x double> %i12, <2 x double> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+ %i20 = shufflevector <4 x double> %i19, <4 x double> %i18, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x double> %i20
+}
diff --git a/llvm/test/CodeGen/X86/pr168594.ll b/llvm/test/CodeGen/X86/pr168594.ll
new file mode 100644
index 0000000..76bb132
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr168594.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX
+
+define <8 x i16> @PR168594() {
+; SSE-LABEL: PR168594:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: PR168594:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %call = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> splat (i16 1), <8 x i16> zeroinitializer)
+ %sub = sub <8 x i16> zeroinitializer, %call
+ ret <8 x i16> %sub
+}
diff --git a/llvm/test/CodeGen/X86/pr169205.ll b/llvm/test/CodeGen/X86/pr169205.ll
new file mode 100644
index 0000000..1416102
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr169205.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX
+
+define <4 x i16> @PR169205() {
+; SSE-LABEL: PR169205:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1,u,u,u,u]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: PR169205:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %avg = tail call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer)
+ %shuffle24 = shufflevector <16 x i8> %avg, <16 x i8> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 9, i32 9>
+ %conv25 = zext <4 x i8> %shuffle24 to <4 x i16>
+ %not.neg = add <4 x i16> %conv25, splat (i16 1)
+ ret <4 x i16> %not.neg
+}
diff --git a/llvm/test/CodeGen/X86/pr49451.ll b/llvm/test/CodeGen/X86/pr49451.ll
index 173c411..1a7551f 100644
--- a/llvm/test/CodeGen/X86/pr49451.ll
+++ b/llvm/test/CodeGen/X86/pr49451.ll
@@ -18,15 +18,15 @@ define void @func_6(i8 %uc_8, i64 %uli_10) nounwind {
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB0_1: # %for.body612
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: testb %dl, %dl
+; X86-NEXT: testb %bl, %bl
; X86-NEXT: je .LBB0_2
; X86-NEXT: # %bb.3: # %if.end1401
; X86-NEXT: # in Loop: Header=BB0_1 Depth=1
; X86-NEXT: addl %eax, %esi
; X86-NEXT: movw %si, s_2
-; X86-NEXT: movw %bx, s_0
+; X86-NEXT: movw %dx, s_0
; X86-NEXT: incl %ecx
-; X86-NEXT: incl %ebx
+; X86-NEXT: incl %edx
; X86-NEXT: cmpw $73, %cx
; X86-NEXT: jl .LBB0_1
; X86-NEXT: # %bb.4: # %for.body1703
diff --git a/llvm/test/CodeGen/X86/pr63790.ll b/llvm/test/CodeGen/X86/pr63790.ll
new file mode 100644
index 0000000..e4e7a3c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr63790.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64 | FileCheck %s
+
+define void @f(ptr %0, i64 %1) {
+; CHECK-LABEL: f:
+; CHECK: # %bb.0: # %BB
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: andl $1, %esi
+; CHECK-NEXT: movaps (%rdi), %xmm0
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $42, %edi
+; CHECK-NEXT: callq *16(%rsp,%rsi,8)
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps %xmm0, (%rax)
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
BB:
 %fps = load <2 x ptr>, ptr %0
 %fp = extractelement <2 x ptr> %fps, i64 %1
 %p = call ptr %fp(i32 42)
 store <2 x ptr> %fps, ptr %p
 ret void
}
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
index ad08eaf..7e00d67 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
@@ -43,25 +43,23 @@ define <16 x i8> @testv16i1_sext_v16i8(ptr %p, ptr %q) {
; AVX256-LABEL: testv16i1_sext_v16i8:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa (%rdi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT: vmovdqa (%rsi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
+; AVX256-NEXT: kshiftrw $8, %k1, %k1
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
-; AVX256-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX256-NEXT: vpacksswb %xmm0, %xmm1, %xmm0
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
;
; AVX512VL-LABEL: testv16i1_sext_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -70,10 +68,8 @@ define <16 x i8> @testv16i1_sext_v16i8(ptr %p, ptr %q) {
; AVX512F-LABEL: testv16i1_sext_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -91,13 +87,13 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) {
; AVX256-LABEL: testv16i1_sext_v16i16:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa (%rdi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT: vmovdqa (%rsi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
-; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z}
+; AVX256-NEXT: kshiftrw $8, %k1, %k1
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
; AVX256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX256-NEXT: retq
@@
-105,10 +101,8 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) { ; AVX512VL-LABEL: testv16i1_sext_v16i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0 -; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 +; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VL-NEXT: retq @@ -116,10 +110,8 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) { ; AVX512F-LABEL: testv16i1_sext_v16i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: retq @@ -173,27 +165,25 @@ define <16 x i8> @testv16i1_zext_v16i8(ptr %p, ptr %q) { ; AVX256-LABEL: testv16i1_zext_v16i8: ; AVX256: # %bb.0: ; AVX256-NEXT: vmovdqa (%rdi), %ymm0 -; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; AVX256-NEXT: vmovdqa (%rsi), %ymm0 -; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2 +; AVX256-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX256-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} +; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm1, %xmm1 ; AVX256-NEXT: vpsrlw $15, %xmm1, %xmm1 +; AVX256-NEXT: kshiftrw $8, %k1, %k1 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm0, %xmm0 ; AVX256-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX256-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq ; ; AVX512VL-LABEL: testv16i1_zext_v16i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0 -; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 +; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper @@ -202,10 +192,8 @@ define <16 x i8> @testv16i1_zext_v16i8(ptr %p, ptr %q) { ; AVX512F-LABEL: testv16i1_zext_v16i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -223,13 +211,13 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) { ; AVX256-LABEL: testv16i1_zext_v16i16: ; AVX256: # %bb.0: ; AVX256-NEXT: vmovdqa (%rdi), %ymm0 -; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; AVX256-NEXT: vmovdqa (%rsi), %ymm0 -; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2 +; AVX256-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX256-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256-NEXT: vmovdqa32 
%ymm0, %ymm1 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm1, %xmm1 -; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z} +; AVX256-NEXT: kshiftrw $8, %k1, %k1 +; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm0, %xmm0 ; AVX256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX256-NEXT: vpsrlw $15, %ymm0, %ymm0 @@ -238,10 +226,8 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) { ; AVX512VL-LABEL: testv16i1_zext_v16i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0 -; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 +; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $15, %ymm0, %ymm0 @@ -250,10 +236,8 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) { ; AVX512F-LABEL: testv16i1_zext_v16i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll index 3699c7f7..9338434 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -18,26 +18,23 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX256VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1 -; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} {z} -; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2 -; AVX256VL-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] -; AVX256VL-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,12,13,4,5,8,9,6,7,14,15,14,15,0,1] -; AVX256VL-NEXT: vpmovsxwd %xmm3, %ymm3 -; AVX256VL-NEXT: vpslld $31, %ymm3, %ymm3 -; AVX256VL-NEXT: vptestmd %ymm3, %ymm3, %k1 -; AVX256VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; AVX256VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,12,13,2,3,u,u,6,7,u,u,14,15,0,1] -; AVX256VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] -; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1 -; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0 -; AVX256VL-NEXT: kunpckbw %k1, %k0, %k0 -; AVX256VL-NEXT: kshiftrw $8, %k0, %k2 -; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} +; AVX256VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,1,3] +; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm3 {%k1} {z} +; AVX256VL-NEXT: vpmovdw %ymm3, %xmm3 +; AVX256VL-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[6,7,12,13,2,3,u,u,6,7,u,u,14,15,0,1] +; AVX256VL-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3],xmm4[4],xmm2[5],xmm4[6,7] +; AVX256VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4],xmm3[5,6,7] +; AVX256VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,12,13,4,5,8,9,6,7,14,15,14,15,0,1] +; AVX256VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX256VL-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX256VL-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX256VL-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 
{%k1} {z} ; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1 +; AVX256VL-NEXT: kshiftrw $8, %k1, %k1 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX256VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX256VL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX256VL-NEXT: vpacksswb %xmm0, %xmm1, %xmm0 ; AVX256VL-NEXT: vzeroupper ; AVX256VL-NEXT: retq ; @@ -135,14 +132,12 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX256VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX256VL-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX256VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256VL-NEXT: vpmovsxbd %xmm1, %ymm1 -; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k2 -; AVX256VL-NEXT: vpmovsxbd %xmm0, %ymm0 -; AVX256VL-NEXT: vptestmd %ymm0, %ymm0, %k3 +; AVX256VL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX256VL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX256VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k3} {z} +; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1 +; AVX256VL-NEXT: kshiftrw $8, %k2, %k2 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k2} {z} ; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2 ; AVX256VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 @@ -153,20 +148,15 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2 ; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] ; AVX256VL-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm1 -; AVX256VL-NEXT: vpmovsxwd %xmm2, %ymm1 -; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX256VL-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1 -; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0 -; AVX256VL-NEXT: kunpckbw %k1, %k0, %k0 -; AVX256VL-NEXT: kshiftrw $8, %k0, %k2 -; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} +; AVX256VL-NEXT: vpmovsxwd %ymm2, %zmm1 +; AVX256VL-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX256VL-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} ; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1 +; AVX256VL-NEXT: kshiftrw $8, %k1, %k1 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX256VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX256VL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX256VL-NEXT: vpacksswb %xmm0, %xmm1, %xmm0 ; AVX256VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX256VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll index 59b03f8..c9e48f8 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll @@ -58,13 +58,12 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) { define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX256BW-LABEL: test_mul_32i8: ; AVX256BW: # %bb.0: -; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX256BW-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX256BW-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 -; AVX256BW-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX256BW-NEXT: vpmullw %ymm1, %ymm0, %ymm2 +; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX256BW-NEXT: vpandn %ymm1, %ymm3, %ymm1 ; AVX256BW-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ; AVX256BW-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX256BW-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm2) +; AVX256BW-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | 
(ymm2 & ymm3) ; AVX256BW-NEXT: retq ; ; AVX512BWVL-LABEL: test_mul_32i8: diff --git a/llvm/test/CodeGen/X86/regalloc-fp.ll b/llvm/test/CodeGen/X86/regalloc-fp.ll new file mode 100644 index 0000000..e89e5ab1 --- /dev/null +++ b/llvm/test/CodeGen/X86/regalloc-fp.ll @@ -0,0 +1,775 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Context: +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +define i32 @check_none() "frame-pointer"="none" { +; CHECK-LABEL: check_none: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $20, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebp +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebp, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; 
CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + %reg14 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + store volatile i32 20, ptr %reg14, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = load volatile i32, ptr %reg14, align 4 + %15 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32 } %15, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 13 + %asmresult14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 14 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + store volatile i32 %asmresult14, ptr %reg14, align 4 + ret i32 0 +} + +define i32 @test_non_leaf_no_reserve() "frame-pointer"="non-leaf-no-reserve" { +; CHECK-LABEL: test_non_leaf_no_reserve: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $20, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebp +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; 
CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebp, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + %reg14 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + store volatile i32 20, ptr %reg14, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = load volatile i32, ptr %reg14, align 4 + %15 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32 } %15, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 13 + %asmresult14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 14 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + store volatile i32 %asmresult14, ptr %reg14, align 4 + ret i32 0 +} + +define i32 @test_non_leaf() "frame-pointer"="non-leaf" { +; CHECK-LABEL: test_non_leaf: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset %rbx, -48 +; CHECK-NEXT: .cfi_offset %r12, -40 +; CHECK-NEXT: .cfi_offset %r13, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: .cfi_offset %r15, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl 
-{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } 
%14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} + +define i32 @test_reserved() "frame-pointer"="reserved" { +; CHECK-LABEL: test_reserved: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset %rbx, -48 +; CHECK-NEXT: .cfi_offset %r12, -40 +; CHECK-NEXT: .cfi_offset %r13, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: .cfi_offset %r15, -16 +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: movl 
-{{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = 
extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 + %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} + +define i32 @test_all() "frame-pointer"="all" { +; CHECK-LABEL: test_all: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: movl $0, -96(%rbp) +; CHECK-NEXT: movl $1, -92(%rbp) +; CHECK-NEXT: movl $2, -88(%rbp) +; CHECK-NEXT: movl $3, -84(%rbp) +; CHECK-NEXT: movl $4, -80(%rbp) +; CHECK-NEXT: movl $5, -76(%rbp) +; CHECK-NEXT: movl $6, -72(%rbp) +; CHECK-NEXT: movl $7, -68(%rbp) +; CHECK-NEXT: movl $8, -64(%rbp) +; CHECK-NEXT: movl $9, -60(%rbp) +; CHECK-NEXT: movl $16, -56(%rbp) +; CHECK-NEXT: movl $17, -52(%rbp) +; CHECK-NEXT: movl $18, -48(%rbp) +; CHECK-NEXT: movl $19, -44(%rbp) +; CHECK-NEXT: movl -96(%rbp), %eax +; CHECK-NEXT: movl -92(%rbp), %ecx +; CHECK-NEXT: movl -88(%rbp), %edx +; CHECK-NEXT: movl -84(%rbp), %esi +; CHECK-NEXT: movl -80(%rbp), %edi +; CHECK-NEXT: movl -76(%rbp), %r8d +; CHECK-NEXT: movl -72(%rbp), %r9d +; CHECK-NEXT: movl -68(%rbp), %r10d +; CHECK-NEXT: movl 
-64(%rbp), %r11d +; CHECK-NEXT: movl -60(%rbp), %ebx +; CHECK-NEXT: movl -56(%rbp), %r14d +; CHECK-NEXT: movl -52(%rbp), %r15d +; CHECK-NEXT: movl -48(%rbp), %r12d +; CHECK-NEXT: movl -44(%rbp), %r13d +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movl %eax, -96(%rbp) +; CHECK-NEXT: movl %ecx, -92(%rbp) +; CHECK-NEXT: movl %edx, -88(%rbp) +; CHECK-NEXT: movl %esi, -84(%rbp) +; CHECK-NEXT: movl %edi, -80(%rbp) +; CHECK-NEXT: movl %r8d, -76(%rbp) +; CHECK-NEXT: movl %r9d, -72(%rbp) +; CHECK-NEXT: movl %r10d, -68(%rbp) +; CHECK-NEXT: movl %r11d, -64(%rbp) +; CHECK-NEXT: movl %ebx, -60(%rbp) +; CHECK-NEXT: movl %r14d, -56(%rbp) +; CHECK-NEXT: movl %r15d, -52(%rbp) +; CHECK-NEXT: movl %r12d, -48(%rbp) +; CHECK-NEXT: movl %r13d, -44(%rbp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq +entry: + %reg0 = alloca i32, align 4 + %reg1 = alloca i32, align 4 + %reg2 = alloca i32, align 4 + %reg3 = alloca i32, align 4 + %reg4 = alloca i32, align 4 + %reg5 = alloca i32, align 4 + %reg6 = alloca i32, align 4 + %reg7 = alloca i32, align 4 + %reg8 = alloca i32, align 4 + %reg9 = alloca i32, align 4 + %reg10 = alloca i32, align 4 + %reg11 = alloca i32, align 4 + %reg12 = alloca i32, align 4 + %reg13 = alloca i32, align 4 + store volatile i32 0, ptr %reg0, align 4 + store volatile i32 1, ptr %reg1, align 4 + store volatile i32 2, ptr %reg2, align 4 + store volatile i32 3, ptr %reg3, align 4 + store volatile i32 4, ptr %reg4, align 4 + store volatile i32 5, ptr %reg5, align 4 + store volatile i32 6, ptr %reg6, align 4 + store volatile i32 7, ptr %reg7, align 4 + store volatile i32 8, ptr %reg8, align 4 + store volatile i32 9, ptr %reg9, align 4 + store volatile i32 16, ptr %reg10, align 4 + store volatile i32 17, ptr %reg11, align 4 + store volatile i32 18, ptr %reg12, align 4 + store volatile i32 19, ptr %reg13, align 4 + %0 = load volatile i32, ptr %reg0, align 4 + %1 = load volatile i32, ptr %reg1, align 4 + %2 = load volatile i32, ptr %reg2, align 4 + %3 = load volatile i32, ptr %reg3, align 4 + %4 = load volatile i32, ptr %reg4, align 4 + %5 = load volatile i32, ptr %reg5, align 4 + %6 = load volatile i32, ptr %reg6, align 4 + %7 = load volatile i32, ptr %reg7, align 4 + %8 = load volatile i32, ptr %reg8, align 4 + %9 = load volatile i32, ptr %reg9, align 4 + %10 = load volatile i32, ptr %reg10, align 4 + %11 = load volatile i32, ptr %reg11, align 4 + %12 = load volatile i32, ptr %reg12, align 4 + %13 = load volatile i32, ptr %reg13, align 4 + %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1 + %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0 + %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1 + %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2 + %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3 + %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4 
+ %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5 + %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6 + %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7 + %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8 + %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9 + %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10 + %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11 + %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12 + %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13 + store volatile i32 %asmresult, ptr %reg0, align 4 + store volatile i32 %asmresult1, ptr %reg1, align 4 + store volatile i32 %asmresult2, ptr %reg2, align 4 + store volatile i32 %asmresult3, ptr %reg3, align 4 + store volatile i32 %asmresult4, ptr %reg4, align 4 + store volatile i32 %asmresult5, ptr %reg5, align 4 + store volatile i32 %asmresult6, ptr %reg6, align 4 + store volatile i32 %asmresult7, ptr %reg7, align 4 + store volatile i32 %asmresult8, ptr %reg8, align 4 + store volatile i32 %asmresult9, ptr %reg9, align 4 + store volatile i32 %asmresult10, ptr %reg10, align 4 + store volatile i32 %asmresult11, ptr %reg11, align 4 + store volatile i32 %asmresult12, ptr %reg12, align 4 + store volatile i32 %asmresult13, ptr %reg13, align 4 + ret i32 0 +} diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll index 8f046a4..26e6886 100644 --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -203,10 +203,10 @@ define i16 @no_extract_mul(i16 %i) nounwind { ; X64-LABEL: no_extract_mul: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $8, %edi ; X64-NEXT: leal (%rdi,%rdi,8), %ecx +; X64-NEXT: leal (%rax,%rax,8), %eax ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: shrl $9, %eax ; X64-NEXT: orl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/rounding-ops.ll b/llvm/test/CodeGen/X86/rounding-ops.ll index 948449c..147663a 100644 --- a/llvm/test/CodeGen/X86/rounding-ops.ll +++ b/llvm/test/CodeGen/X86/rounding-ops.ll @@ -60,12 +60,10 @@ define float @test3(float %x) nounwind { ; CHECK-AVX512: ## %bb.0: ; CHECK-AVX512-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: retq - %call = tail call float @nearbyintf(float %x) nounwind readnone + %call = tail call float @llvm.nearbyint.f32(float %x) nounwind readnone ret float %call } -declare float @nearbyintf(float) nounwind readnone - define double @test4(double %x) nounwind { ; CHECK-SSE-LABEL: test4: ; CHECK-SSE: ## %bb.0: @@ -81,12 +79,10 @@ define double @test4(double %x) nounwind { ; CHECK-AVX512: ## %bb.0: ; CHECK-AVX512-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: retq - %call = tail call double @nearbyint(double %x) nounwind readnone + %call = tail call double @llvm.nearbyint.f64(double %x) nounwind readnone ret double %call } -declare double @nearbyint(double) nounwind readnone - define float @test5(float %x) nounwind { 
; CHECK-SSE-LABEL: test5: ; CHECK-SSE: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/scatter-schedule.ll b/llvm/test/CodeGen/X86/scatter-schedule.ll index 762a050..36bf313 100644 --- a/llvm/test/CodeGen/X86/scatter-schedule.ll +++ b/llvm/test/CodeGen/X86/scatter-schedule.ll @@ -9,9 +9,9 @@ target triple = "x86_64-unknown-linux-gnu" define void @test(i64 %x272, <16 x ptr> %x335, <16 x i32> %x270) { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; CHECK-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll index d018c53..23c3e84 100644 --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,NO512,SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=CHECK,NO512,SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,NO512,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,NO512,AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW ; Equality checks of 128/256-bit values can use PMOVMSK or PTEST to avoid scalarization. 
@@ -26,13 +26,13 @@ define i32 @ne_i128(<2 x i64> %x, <2 x i64> %y) { ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVXANY-LABEL: ne_i128: -; AVXANY: # %bb.0: -; AVXANY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVXANY-NEXT: xorl %eax, %eax -; AVXANY-NEXT: vptest %xmm0, %xmm0 -; AVXANY-NEXT: setne %al -; AVXANY-NEXT: retq +; AVX-LABEL: ne_i128: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vptest %xmm0, %xmm0 +; AVX-NEXT: setne %al +; AVX-NEXT: retq %bcx = bitcast <2 x i64> %x to i128 %bcy = bitcast <2 x i64> %y to i128 %cmp = icmp ne i128 %bcx, %bcy @@ -58,13 +58,13 @@ define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVXANY-LABEL: eq_i128: -; AVXANY: # %bb.0: -; AVXANY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVXANY-NEXT: xorl %eax, %eax -; AVXANY-NEXT: vptest %xmm0, %xmm0 -; AVXANY-NEXT: sete %al -; AVXANY-NEXT: retq +; AVX-LABEL: eq_i128: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vptest %xmm0, %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %bcx = bitcast <2 x i64> %x to i128 %bcy = bitcast <2 x i64> %y to i128 %cmp = icmp eq i128 %bcx, %bcy @@ -722,39 +722,27 @@ define i1 @ne_v4i256(<4 x i256> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: shrq $32, %rax -; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: shrq $32, %r10 -; AVX512-NEXT: vpinsrd $3, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r8 -; AVX512-NEXT: vmovd %r8d, %xmm1 -; AVX512-NEXT: shrq $32, %r8 -; AVX512-NEXT: vpinsrd $1, %r8d, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %r10, %xmm0 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; AVX512-NEXT: vpinsrd $2, %r9d, %xmm1, %xmm1 -; AVX512-NEXT: shrq $32, %r9 -; AVX512-NEXT: vpinsrd $3, %r9d, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %r9, %xmm1 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r8 +; AVX512-NEXT: vmovq %r8, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: vmovd %edx, %xmm1 -; AVX512-NEXT: shrq $32, %rdx -; AVX512-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; AVX512-NEXT: shrq $32, %rcx -; AVX512-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: vmovd %edi, %xmm2 -; AVX512-NEXT: shrq $32, %rdi -; AVX512-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: vmovq %rdx, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 -; AVX512-NEXT: shrq $32, %rsi -; AVX512-NEXT: vpinsrd $3, %esi, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %rsi, %xmm2 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdi +; AVX512-NEXT: vmovq %rdi, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -797,17 +785,17 @@ 
define i32 @ne_i128_pair(ptr %a, ptr %b) { ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVXANY-LABEL: ne_i128_pair: -; AVXANY: # %bb.0: -; AVXANY-NEXT: vmovdqa (%rdi), %xmm0 -; AVXANY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVXANY-NEXT: vpxor 16(%rsi), %xmm1, %xmm1 -; AVXANY-NEXT: vpxor (%rsi), %xmm0, %xmm0 -; AVXANY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVXANY-NEXT: xorl %eax, %eax -; AVXANY-NEXT: vptest %xmm0, %xmm0 -; AVXANY-NEXT: setne %al -; AVXANY-NEXT: retq +; AVX-LABEL: ne_i128_pair: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpxor 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vptest %xmm0, %xmm0 +; AVX-NEXT: setne %al +; AVX-NEXT: retq %a0 = load i128, ptr %a %b0 = load i128, ptr %b %xor1 = xor i128 %a0, %b0 @@ -851,17 +839,17 @@ define i32 @eq_i128_pair(ptr %a, ptr %b) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVXANY-LABEL: eq_i128_pair: -; AVXANY: # %bb.0: -; AVXANY-NEXT: vmovdqa (%rdi), %xmm0 -; AVXANY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVXANY-NEXT: vpxor 16(%rsi), %xmm1, %xmm1 -; AVXANY-NEXT: vpxor (%rsi), %xmm0, %xmm0 -; AVXANY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVXANY-NEXT: xorl %eax, %eax -; AVXANY-NEXT: vptest %xmm0, %xmm0 -; AVXANY-NEXT: sete %al -; AVXANY-NEXT: retq +; AVX-LABEL: eq_i128_pair: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpxor 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vptest %xmm0, %xmm0 +; AVX-NEXT: sete %al +; AVX-NEXT: retq %a0 = load i128, ptr %a %b0 = load i128, ptr %b %xor1 = xor i128 %a0, %b0 @@ -1236,90 +1224,90 @@ define i32 @eq_i512_pair(ptr %a, ptr %b) { ; PR41971: Comparison using vector types is not favorable here. 
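; [Editorial note, illustrative only] Why vectors lose for PR41971: under the
; SysV x86-64 ABI both i128 arguments already arrive in GPR pairs (%a in
; rdi:rsi, %b in rdx:rcx), so the scalar lowering shown in the checks below,
;   xorq %rcx, %rsi ; xorq %rdx, %rdi ; orq %rsi, %rdi ; sete %al
; crosses no register files, while a VPTEST version would first have to move
; all four GPRs into vector registers.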
define i1 @eq_i128_args(i128 %a, i128 %b) { -; ANY-LABEL: eq_i128_args: -; ANY: # %bb.0: -; ANY-NEXT: xorq %rcx, %rsi -; ANY-NEXT: xorq %rdx, %rdi -; ANY-NEXT: orq %rsi, %rdi -; ANY-NEXT: sete %al -; ANY-NEXT: retq +; CHECK-LABEL: eq_i128_args: +; CHECK: # %bb.0: +; CHECK-NEXT: xorq %rcx, %rsi +; CHECK-NEXT: xorq %rdx, %rdi +; CHECK-NEXT: orq %rsi, %rdi +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %r = icmp eq i128 %a, %b ret i1 %r } define i1 @eq_i256_args(i256 %a, i256 %b) { -; ANY-LABEL: eq_i256_args: -; ANY: # %bb.0: -; ANY-NEXT: xorq %r9, %rsi -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; ANY-NEXT: orq %rsi, %rcx -; ANY-NEXT: xorq %r8, %rdi -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx -; ANY-NEXT: orq %rdi, %rdx -; ANY-NEXT: orq %rcx, %rdx -; ANY-NEXT: sete %al -; ANY-NEXT: retq +; CHECK-LABEL: eq_i256_args: +; CHECK: # %bb.0: +; CHECK-NEXT: xorq %r9, %rsi +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: orq %rsi, %rcx +; CHECK-NEXT: xorq %r8, %rdi +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: orq %rdi, %rdx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %r = icmp eq i256 %a, %b ret i1 %r } define i1 @eq_i512_args(i512 %a, i512 %b) { -; ANY-LABEL: eq_i512_args: -; ANY: # %bb.0: -; ANY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; ANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; ANY-NEXT: orq %r10, %rcx -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi -; ANY-NEXT: orq %r9, %rsi -; ANY-NEXT: orq %rcx, %rsi -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx -; ANY-NEXT: orq %rax, %rdx -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi -; ANY-NEXT: orq %r8, %rdi -; ANY-NEXT: orq %rdx, %rdi -; ANY-NEXT: orq %rsi, %rdi -; ANY-NEXT: sete %al -; ANY-NEXT: retq +; CHECK-LABEL: eq_i512_args: +; CHECK: # %bb.0: +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: orq %r10, %rcx +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r9 +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: orq %r9, %rsi +; CHECK-NEXT: orq %rcx, %rsi +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: orq %rax, %rdx +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r8 +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: orq %r8, %rdi +; CHECK-NEXT: orq %rdx, %rdi +; CHECK-NEXT: orq %rsi, %rdi +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %r = icmp eq i512 %a, %b ret i1 %r } define i1 @eq_i128_op(i128 %a, i128 %b) { -; ANY-LABEL: eq_i128_op: -; ANY: # %bb.0: -; ANY-NEXT: addq $1, %rdi -; ANY-NEXT: adcq $0, %rsi -; ANY-NEXT: xorq %rdx, %rdi -; ANY-NEXT: xorq %rcx, %rsi -; ANY-NEXT: orq %rdi, %rsi -; ANY-NEXT: sete %al -; ANY-NEXT: retq +; CHECK-LABEL: eq_i128_op: +; CHECK: # %bb.0: +; CHECK-NEXT: addq $1, %rdi +; CHECK-NEXT: adcq $0, %rsi +; CHECK-NEXT: xorq %rdx, %rdi +; CHECK-NEXT: xorq %rcx, %rsi +; CHECK-NEXT: orq %rdi, %rsi +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %a2 = add i128 %a, 1 %r = icmp eq i128 %a2, %b ret i1 %r } define i1 @eq_i256_op(i256 %a, i256 %b) { -; ANY-LABEL: eq_i256_op: -; ANY: # %bb.0: -; ANY-NEXT: addq $1, %rdi -; ANY-NEXT: adcq $0, %rsi -; ANY-NEXT: adcq $0, %rdx -; ANY-NEXT: adcq $0, %rcx -; ANY-NEXT: xorq %r8, %rdi -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx -; ANY-NEXT: orq %rdi, %rdx -; ANY-NEXT: xorq %r9, %rsi -; ANY-NEXT: xorq 
{{[0-9]+}}(%rsp), %rcx -; ANY-NEXT: orq %rsi, %rcx -; ANY-NEXT: orq %rdx, %rcx -; ANY-NEXT: sete %al -; ANY-NEXT: retq +; CHECK-LABEL: eq_i256_op: +; CHECK: # %bb.0: +; CHECK-NEXT: addq $1, %rdi +; CHECK-NEXT: adcq $0, %rsi +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: adcq $0, %rcx +; CHECK-NEXT: xorq %r8, %rdi +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: orq %rdi, %rdx +; CHECK-NEXT: xorq %r9, %rsi +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: orq %rsi, %rcx +; CHECK-NEXT: orq %rdx, %rcx +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %a2 = add i256 %a, 1 %r = icmp eq i256 %a2, %b ret i1 %r @@ -1356,93 +1344,93 @@ define i1 @eq_i512_op(i512 %a, i512 %b) { ; SSE-NEXT: sete %al ; SSE-NEXT: retq ; -; AVXANY-LABEL: eq_i512_op: -; AVXANY: # %bb.0: -; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVXANY-NEXT: addq $1, %rdi -; AVXANY-NEXT: adcq $0, %rsi -; AVXANY-NEXT: adcq $0, %rdx -; AVXANY-NEXT: adcq $0, %rcx -; AVXANY-NEXT: adcq $0, %r8 -; AVXANY-NEXT: adcq $0, %r9 -; AVXANY-NEXT: adcq $0, %r10 -; AVXANY-NEXT: adcq $0, %rax -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 -; AVXANY-NEXT: orq %rsi, %r9 -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax -; AVXANY-NEXT: orq %rcx, %rax -; AVXANY-NEXT: orq %r9, %rax -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 -; AVXANY-NEXT: orq %rdx, %r10 -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi -; AVXANY-NEXT: orq %r8, %rdi -; AVXANY-NEXT: orq %r10, %rdi -; AVXANY-NEXT: orq %rax, %rdi -; AVXANY-NEXT: sete %al -; AVXANY-NEXT: retq +; AVX-LABEL: eq_i512_op: +; AVX: # %bb.0: +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: addq $1, %rdi +; AVX-NEXT: adcq $0, %rsi +; AVX-NEXT: adcq $0, %rdx +; AVX-NEXT: adcq $0, %rcx +; AVX-NEXT: adcq $0, %r8 +; AVX-NEXT: adcq $0, %r9 +; AVX-NEXT: adcq $0, %r10 +; AVX-NEXT: adcq $0, %rax +; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rsi +; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %r9 +; AVX-NEXT: orq %rsi, %r9 +; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: orq %rcx, %rax +; AVX-NEXT: orq %r9, %rax +; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rdx +; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %r10 +; AVX-NEXT: orq %rdx, %r10 +; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %r8 +; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rdi +; AVX-NEXT: orq %r8, %rdi +; AVX-NEXT: orq %r10, %rdi +; AVX-NEXT: orq %rax, %rdi +; AVX-NEXT: sete %al +; AVX-NEXT: retq %a2 = add i512 %a, 1 %r = icmp eq i512 %a2, %b ret i1 %r } define i1 @eq_i128_load_arg(ptr%p, i128 %b) { -; ANY-LABEL: eq_i128_load_arg: -; ANY: # %bb.0: -; ANY-NEXT: xorq 8(%rdi), %rdx -; ANY-NEXT: xorq (%rdi), %rsi -; ANY-NEXT: orq %rdx, %rsi -; ANY-NEXT: sete %al -; ANY-NEXT: retq +; CHECK-LABEL: eq_i128_load_arg: +; CHECK: # %bb.0: +; CHECK-NEXT: xorq 8(%rdi), %rdx +; CHECK-NEXT: xorq (%rdi), %rsi +; CHECK-NEXT: orq %rdx, %rsi +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %a = load i128, ptr %p %r = icmp eq i128 %a, %b ret i1 %r } define i1 @eq_i256_load_arg(ptr%p, i256 %b) { -; ANY-LABEL: eq_i256_load_arg: -; ANY: # %bb.0: -; ANY-NEXT: xorq 24(%rdi), %r8 -; ANY-NEXT: xorq 8(%rdi), %rdx -; ANY-NEXT: orq %r8, %rdx -; ANY-NEXT: xorq 16(%rdi), %rcx -; ANY-NEXT: xorq (%rdi), %rsi -; ANY-NEXT: orq %rcx, %rsi -; ANY-NEXT: orq %rdx, %rsi -; ANY-NEXT: sete %al -; ANY-NEXT: retq +; CHECK-LABEL: eq_i256_load_arg: +; CHECK: # %bb.0: +; 
CHECK-NEXT: xorq 24(%rdi), %r8 +; CHECK-NEXT: xorq 8(%rdi), %rdx +; CHECK-NEXT: orq %r8, %rdx +; CHECK-NEXT: xorq 16(%rdi), %rcx +; CHECK-NEXT: xorq (%rdi), %rsi +; CHECK-NEXT: orq %rcx, %rsi +; CHECK-NEXT: orq %rdx, %rsi +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %a = load i256, ptr %p %r = icmp eq i256 %a, %b ret i1 %r } define i1 @eq_i512_load_arg(ptr%p, i512 %b) { -; ANY-LABEL: eq_i512_load_arg: -; ANY: # %bb.0: -; ANY-NEXT: movq 40(%rdi), %rax -; ANY-NEXT: movq 48(%rdi), %r10 -; ANY-NEXT: movq 56(%rdi), %r11 -; ANY-NEXT: xorq 24(%rdi), %r8 -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r11 -; ANY-NEXT: orq %r8, %r11 -; ANY-NEXT: xorq 8(%rdi), %rdx -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax -; ANY-NEXT: orq %rdx, %rax -; ANY-NEXT: orq %r11, %rax -; ANY-NEXT: xorq 32(%rdi), %r9 -; ANY-NEXT: xorq (%rdi), %rsi -; ANY-NEXT: orq %r9, %rsi -; ANY-NEXT: xorq 16(%rdi), %rcx -; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 -; ANY-NEXT: orq %rcx, %r10 -; ANY-NEXT: orq %rsi, %r10 -; ANY-NEXT: orq %rax, %r10 -; ANY-NEXT: sete %al -; ANY-NEXT: retq +; CHECK-LABEL: eq_i512_load_arg: +; CHECK: # %bb.0: +; CHECK-NEXT: movq 40(%rdi), %rax +; CHECK-NEXT: movq 48(%rdi), %r10 +; CHECK-NEXT: movq 56(%rdi), %r11 +; CHECK-NEXT: xorq 24(%rdi), %r8 +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r11 +; CHECK-NEXT: orq %r8, %r11 +; CHECK-NEXT: xorq 8(%rdi), %rdx +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: orq %rdx, %rax +; CHECK-NEXT: orq %r11, %rax +; CHECK-NEXT: xorq 32(%rdi), %r9 +; CHECK-NEXT: xorq (%rdi), %rsi +; CHECK-NEXT: orq %r9, %rsi +; CHECK-NEXT: xorq 16(%rdi), %rcx +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: orq %rcx, %r10 +; CHECK-NEXT: orq %rsi, %r10 +; CHECK-NEXT: orq %rax, %r10 +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %a = load i512, ptr %p %r = icmp eq i512 %a, %b ret i1 %r @@ -1451,12 +1439,12 @@ define i1 @eq_i512_load_arg(ptr%p, i512 %b) { ; Tests for any/allbits from memory. 
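; [Editorial sketch, hypothetical, not in this commit] The two idioms these
; tests cover, shown on i128 for brevity:
;   anybits: icmp ne %ld, 0  -> OR the loaded halves together, then SETNE.
;   allbits: icmp eq %ld, -1 -> AVX: vptest %ones, %val sets CF only when
;            (%ones & ~%val) == 0, i.e. %val is all-ones, so SETB answers it.
define i1 @allbits_i128_sketch(ptr %w) {
  %ld = load i128, ptr %w
  %cmp = icmp eq i128 %ld, -1
  ret i1 %cmp
}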
define i1 @anybits_i128_load_arg(ptr %w) { -; ANY-LABEL: anybits_i128_load_arg: -; ANY: # %bb.0: -; ANY-NEXT: movq (%rdi), %rax -; ANY-NEXT: orq 8(%rdi), %rax -; ANY-NEXT: setne %al -; ANY-NEXT: retq +; CHECK-LABEL: anybits_i128_load_arg: +; CHECK: # %bb.0: +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: orq 8(%rdi), %rax +; CHECK-NEXT: setne %al +; CHECK-NEXT: retq %ld = load i128, ptr %w %cmp = icmp ne i128 %ld, 0 ret i1 %cmp @@ -1480,13 +1468,13 @@ define i1 @allbits_i128_load_arg(ptr %w) { ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVXANY-LABEL: allbits_i128_load_arg: -; AVXANY: # %bb.0: -; AVXANY-NEXT: vmovdqa (%rdi), %xmm0 -; AVXANY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVXANY-NEXT: vptest %xmm1, %xmm0 -; AVXANY-NEXT: setb %al -; AVXANY-NEXT: retq +; AVX-LABEL: allbits_i128_load_arg: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vptest %xmm1, %xmm0 +; AVX-NEXT: setb %al +; AVX-NEXT: retq %ld = load i128, ptr %w %cmp = icmp eq i128 %ld, -1 ret i1 %cmp @@ -1503,13 +1491,13 @@ define i1 @anybits_i256_load_arg(ptr %w) { ; SSE-NEXT: setne %al ; SSE-NEXT: retq ; -; AVXANY-LABEL: anybits_i256_load_arg: -; AVXANY: # %bb.0: -; AVXANY-NEXT: vmovdqu (%rdi), %ymm0 -; AVXANY-NEXT: vptest %ymm0, %ymm0 -; AVXANY-NEXT: setne %al -; AVXANY-NEXT: vzeroupper -; AVXANY-NEXT: retq +; AVX-LABEL: anybits_i256_load_arg: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqu (%rdi), %ymm0 +; AVX-NEXT: vptest %ymm0, %ymm0 +; AVX-NEXT: setne %al +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %ld = load i256, ptr %w %cmp = icmp ne i256 %ld, 0 ret i1 %cmp diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll index 03b61d9..4d341f1 100644 --- a/llvm/test/CodeGen/X86/shift-i512.ll +++ b/llvm/test/CodeGen/X86/shift-i512.ll @@ -1,208 +1,2050 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s -check-prefixes=AVX512VL -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+avx512vbmi2 | FileCheck %s -check-prefixes=AVX512VBMI -; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s -check-prefixes=ZNVER4 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s -check-prefixes=CHECK,SSE +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s -check-prefixes=CHECK,SSE +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s -check-prefixes=CHECK,AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=knl | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 -mattr=+avx512vbmi2 | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512VBMI -; i512 shifts hidden inside 512-bit vectors. 
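; [Editorial note, illustrative only] The rewritten tests below exercise
; first-class i512 shifts instead. Two lowering shapes recur in the checks:
; a constant 1-bit shift decomposes into per-limb double shifts,
;   limb[k] = (limb[k] << 1) | (limb[k-1] >> 63)   ; i.e. shldq $1, limb[k-1], limb[k]
; while a variable shift spills the value to a zero- (or sign-) padded stack
; buffer, picks the limb base address from (amt >> 3) & 56 bytes, and
; funnel-shifts the loaded words by amt & 63 bits.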
+define i512 @shl_i512(i512 %a0, i512 %a1) nounwind { +; SSE-LABEL: shl_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: pushq %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: andl $56, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: cltq +; SSE-NEXT: movq -56(%rsp,%rax), %rdx +; SSE-NEXT: movq -48(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %rsi +; SSE-NEXT: shldq %cl, %rdx, %rsi +; SSE-NEXT: movq -40(%rsp,%rax), %r10 +; SSE-NEXT: movq %r10, %r8 +; SSE-NEXT: shldq %cl, %r9, %r8 +; SSE-NEXT: movq -32(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %r11 +; SSE-NEXT: shldq %cl, %r10, %r11 +; SSE-NEXT: movq -24(%rsp,%rax), %r10 +; SSE-NEXT: movq %r10, %rbx +; SSE-NEXT: shldq %cl, %r9, %rbx +; SSE-NEXT: movq -16(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %r14 +; SSE-NEXT: shldq %cl, %r10, %r14 +; SSE-NEXT: movq -8(%rsp,%rax), %r10 +; SSE-NEXT: shldq %cl, %r9, %r10 +; SSE-NEXT: movq -64(%rsp,%rax), %rax +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: shlq %cl, %r9 +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movq %r10, 56(%rdi) +; SSE-NEXT: movq %r14, 48(%rdi) +; SSE-NEXT: movq %rbx, 40(%rdi) +; SSE-NEXT: movq %r11, 32(%rdi) +; SSE-NEXT: movq %r8, 24(%rdi) +; SSE-NEXT: movq %rsi, 16(%rdi) +; SSE-NEXT: movq %rdx, 8(%rdi) +; SSE-NEXT: movq %r9, (%rdi) +; SSE-NEXT: addq $8, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: retq +; +; AVX2-LABEL: shl_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: andl $56, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: movslq %eax, %r8 +; AVX2-NEXT: movq -56(%rsp,%r8), %rdx +; AVX2-NEXT: movq -48(%rsp,%r8), %rax +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shldq %cl, %rdx, %rsi +; AVX2-NEXT: movq -40(%rsp,%r8), %r10 +; AVX2-NEXT: movq %r10, %r9 +; AVX2-NEXT: shldq %cl, %rax, %r9 +; AVX2-NEXT: movq -32(%rsp,%r8), %rax +; AVX2-NEXT: movq %rax, %r11 +; AVX2-NEXT: shldq %cl, %r10, %r11 +; AVX2-NEXT: movq -24(%rsp,%r8), %r10 +; AVX2-NEXT: movq %r10, %rbx +; AVX2-NEXT: shldq %cl, %rax, %rbx +; AVX2-NEXT: movq -16(%rsp,%r8), %rax +; AVX2-NEXT: movq %rax, %r14 +; AVX2-NEXT: shldq %cl, %r10, %r14 +; 
AVX2-NEXT: movq -8(%rsp,%r8), %r10 +; AVX2-NEXT: shldq %cl, %rax, %r10 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: movq -64(%rsp,%r8), %rdi +; AVX2-NEXT: shlxq %rcx, %rdi, %r8 +; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX2-NEXT: shldq %cl, %rdi, %rdx +; AVX2-NEXT: movq %r10, 56(%rax) +; AVX2-NEXT: movq %r14, 48(%rax) +; AVX2-NEXT: movq %rbx, 40(%rax) +; AVX2-NEXT: movq %r11, 32(%rax) +; AVX2-NEXT: movq %r9, 24(%rax) +; AVX2-NEXT: movq %rsi, 16(%rax) +; AVX2-NEXT: movq %rdx, 8(%rax) +; AVX2-NEXT: movq %r8, (%rax) +; AVX2-NEXT: addq $8, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: shl_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: pushq %rax +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: shrl $3, %eax +; AVX512F-NEXT: andl $56, %eax +; AVX512F-NEXT: negl %eax +; AVX512F-NEXT: movslq %eax, %r8 +; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx +; AVX512F-NEXT: movq -48(%rsp,%r8), %rax +; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: shldq %cl, %rdx, %rsi +; AVX512F-NEXT: movq -40(%rsp,%r8), %r10 +; AVX512F-NEXT: movq %r10, %r9 +; AVX512F-NEXT: shldq %cl, %rax, %r9 +; AVX512F-NEXT: movq -32(%rsp,%r8), %rax +; AVX512F-NEXT: movq %rax, %r11 +; AVX512F-NEXT: shldq %cl, %r10, %r11 +; AVX512F-NEXT: movq -24(%rsp,%r8), %r10 +; AVX512F-NEXT: movq %r10, %rbx +; AVX512F-NEXT: shldq %cl, %rax, %rbx +; AVX512F-NEXT: movq -16(%rsp,%r8), %rax +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: shldq %cl, %r10, %r14 +; AVX512F-NEXT: movq -8(%rsp,%r8), %r10 +; AVX512F-NEXT: shldq %cl, %rax, %r10 +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi +; AVX512F-NEXT: shlxq %rcx, %rdi, %r8 +; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512F-NEXT: shldq %cl, %rdi, %rdx +; AVX512F-NEXT: movq %r10, 56(%rax) +; AVX512F-NEXT: movq %r14, 48(%rax) +; AVX512F-NEXT: movq %rbx, 40(%rax) +; AVX512F-NEXT: movq %r11, 32(%rax) +; AVX512F-NEXT: movq %r9, 24(%rax) +; AVX512F-NEXT: movq %rsi, 16(%rax) +; AVX512F-NEXT: movq %rdx, 8(%rax) +; AVX512F-NEXT: movq %r8, (%rax) +; AVX512F-NEXT: addq $8, %rsp +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shl_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: pushq %r15 +; AVX512VL-NEXT: pushq %r14 +; AVX512VL-NEXT: pushq %rbx +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; 
AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: andl $63, %ecx +; AVX512VL-NEXT: shrl $3, %eax +; AVX512VL-NEXT: andl $56, %eax +; AVX512VL-NEXT: negl %eax +; AVX512VL-NEXT: movslq %eax, %r9 +; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx +; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax +; AVX512VL-NEXT: movq %rax, %rsi +; AVX512VL-NEXT: shldq %cl, %rdx, %rsi +; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10 +; AVX512VL-NEXT: movq %r10, %r8 +; AVX512VL-NEXT: shldq %cl, %rax, %r8 +; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11 +; AVX512VL-NEXT: movq %r11, %rbx +; AVX512VL-NEXT: shldq %cl, %r10, %rbx +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi +; AVX512VL-NEXT: movq %rdi, %r10 +; AVX512VL-NEXT: shldq %cl, %r11, %r10 +; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11 +; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14 +; AVX512VL-NEXT: movq %r14, %r15 +; AVX512VL-NEXT: shldq %cl, %rdi, %r15 +; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi +; AVX512VL-NEXT: shldq %cl, %r14, %rdi +; AVX512VL-NEXT: shlxq %rcx, %r11, %r9 +; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512VL-NEXT: shldq %cl, %r11, %rdx +; AVX512VL-NEXT: movq %rdi, 56(%rax) +; AVX512VL-NEXT: movq %r15, 48(%rax) +; AVX512VL-NEXT: movq %r10, 40(%rax) +; AVX512VL-NEXT: movq %rbx, 32(%rax) +; AVX512VL-NEXT: movq %r8, 24(%rax) +; AVX512VL-NEXT: movq %rsi, 16(%rax) +; AVX512VL-NEXT: movq %rdx, 8(%rax) +; AVX512VL-NEXT: movq %r9, (%rax) +; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: popq %r14 +; AVX512VL-NEXT: popq %r15 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512VBMI-LABEL: shl_i512: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: pushq %r15 +; AVX512VBMI-NEXT: pushq %r14 +; AVX512VBMI-NEXT: pushq %rbx +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movl %eax, %ecx +; AVX512VBMI-NEXT: andl $63, %ecx +; AVX512VBMI-NEXT: shrl $3, %eax +; AVX512VBMI-NEXT: andl $56, %eax +; AVX512VBMI-NEXT: negl %eax +; AVX512VBMI-NEXT: movslq %eax, %r9 +; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx +; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax +; AVX512VBMI-NEXT: movq %rax, %rsi +; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi +; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10 +; AVX512VBMI-NEXT: movq %r10, %r8 +; AVX512VBMI-NEXT: shldq %cl, %rax, %r8 +; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11 +; AVX512VBMI-NEXT: movq %r11, %rbx +; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx +; AVX512VBMI-NEXT: movq %rdi, %rax +; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi +; AVX512VBMI-NEXT: movq %rdi, %r10 +; AVX512VBMI-NEXT: shldq %cl, %r11, %r10 +; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11 +; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14 +; AVX512VBMI-NEXT: movq %r14, %r15 +; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15 +; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi +; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi +; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9 +; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512VBMI-NEXT: shldq 
%cl, %r11, %rdx +; AVX512VBMI-NEXT: movq %rdi, 56(%rax) +; AVX512VBMI-NEXT: movq %r15, 48(%rax) +; AVX512VBMI-NEXT: movq %r10, 40(%rax) +; AVX512VBMI-NEXT: movq %rbx, 32(%rax) +; AVX512VBMI-NEXT: movq %r8, 24(%rax) +; AVX512VBMI-NEXT: movq %rsi, 16(%rax) +; AVX512VBMI-NEXT: movq %rdx, 8(%rax) +; AVX512VBMI-NEXT: movq %r9, (%rax) +; AVX512VBMI-NEXT: popq %rbx +; AVX512VBMI-NEXT: popq %r14 +; AVX512VBMI-NEXT: popq %r15 +; AVX512VBMI-NEXT: vzeroupper +; AVX512VBMI-NEXT: retq + %r = shl i512 %a0, %a1 + ret i512 %r +} -define <8 x i64> @shl_i512_1(<8 x i64> %a) { -; AVX512VL-LABEL: shl_i512_1: +define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind { +; SSE-LABEL: lshr_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: andl $56, %eax +; SSE-NEXT: movq -112(%rsp,%rax), %rdx +; SSE-NEXT: movq -120(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %rsi +; SSE-NEXT: shrdq %cl, %rdx, %rsi +; SSE-NEXT: movq -104(%rsp,%rax), %r8 +; SSE-NEXT: shrdq %cl, %r8, %rdx +; SSE-NEXT: movq -96(%rsp,%rax), %r10 +; SSE-NEXT: shrdq %cl, %r10, %r8 +; SSE-NEXT: movq -88(%rsp,%rax), %r11 +; SSE-NEXT: shrdq %cl, %r11, %r10 +; SSE-NEXT: movq -80(%rsp,%rax), %rbx +; SSE-NEXT: shrdq %cl, %rbx, %r11 +; SSE-NEXT: movq -72(%rsp,%rax), %r14 +; SSE-NEXT: shrdq %cl, %r14, %rbx +; SSE-NEXT: movq -128(%rsp,%rax), %r15 +; SSE-NEXT: shrdq %cl, %r9, %r15 +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shrq %cl, %r14 +; SSE-NEXT: movq %r14, 56(%rdi) +; SSE-NEXT: movq %rbx, 48(%rdi) +; SSE-NEXT: movq %r11, 40(%rdi) +; SSE-NEXT: movq %r10, 32(%rdi) +; SSE-NEXT: movq %r8, 24(%rdi) +; SSE-NEXT: movq %rdx, 16(%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: movq %r15, (%rdi) +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: lshr_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: andl $56, %eax +; AVX2-NEXT: movq -112(%rsp,%rax), %rdx +; AVX2-NEXT: movq -120(%rsp,%rax), %r9 +; AVX2-NEXT: movq %r9, %rsi +; AVX2-NEXT: shrdq %cl, %rdx, %rsi +; AVX2-NEXT: movq -104(%rsp,%rax), %r8 +; 
AVX2-NEXT: shrdq %cl, %r8, %rdx +; AVX2-NEXT: movq -96(%rsp,%rax), %r10 +; AVX2-NEXT: shrdq %cl, %r10, %r8 +; AVX2-NEXT: movq -88(%rsp,%rax), %r11 +; AVX2-NEXT: shrdq %cl, %r11, %r10 +; AVX2-NEXT: movq -80(%rsp,%rax), %rbx +; AVX2-NEXT: shrdq %cl, %rbx, %r11 +; AVX2-NEXT: movq -128(%rsp,%rax), %r14 +; AVX2-NEXT: movq -72(%rsp,%rax), %r15 +; AVX2-NEXT: shrdq %cl, %r15, %rbx +; AVX2-NEXT: shrdq %cl, %r9, %r14 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrxq %rcx, %r15, %rcx +; AVX2-NEXT: movq %rcx, 56(%rdi) +; AVX2-NEXT: movq %rbx, 48(%rdi) +; AVX2-NEXT: movq %r11, 40(%rdi) +; AVX2-NEXT: movq %r10, 32(%rdi) +; AVX2-NEXT: movq %r8, 24(%rdi) +; AVX2-NEXT: movq %rdx, 16(%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: movq %r14, (%rdi) +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: lshr_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r15 +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: shrl $3, %eax +; AVX512F-NEXT: andl $56, %eax +; AVX512F-NEXT: movq -112(%rsp,%rax), %rdx +; AVX512F-NEXT: movq -120(%rsp,%rax), %r9 +; AVX512F-NEXT: movq %r9, %rsi +; AVX512F-NEXT: shrdq %cl, %rdx, %rsi +; AVX512F-NEXT: movq -104(%rsp,%rax), %r8 +; AVX512F-NEXT: shrdq %cl, %r8, %rdx +; AVX512F-NEXT: movq -96(%rsp,%rax), %r10 +; AVX512F-NEXT: shrdq %cl, %r10, %r8 +; AVX512F-NEXT: movq -88(%rsp,%rax), %r11 +; AVX512F-NEXT: shrdq %cl, %r11, %r10 +; AVX512F-NEXT: movq -80(%rsp,%rax), %rbx +; AVX512F-NEXT: shrdq %cl, %rbx, %r11 +; AVX512F-NEXT: movq -128(%rsp,%rax), %r14 +; AVX512F-NEXT: movq -72(%rsp,%rax), %r15 +; AVX512F-NEXT: shrdq %cl, %r15, %rbx +; AVX512F-NEXT: shrdq %cl, %r9, %r14 +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: shrxq %rcx, %r15, %rcx +; AVX512F-NEXT: movq %rcx, 56(%rdi) +; AVX512F-NEXT: movq %rbx, 48(%rdi) +; AVX512F-NEXT: movq %r11, 40(%rdi) +; AVX512F-NEXT: movq %r10, 32(%rdi) +; AVX512F-NEXT: movq %r8, 24(%rdi) +; AVX512F-NEXT: movq %rdx, 16(%rdi) +; AVX512F-NEXT: movq %rsi, 8(%rdi) +; AVX512F-NEXT: movq %r14, (%rdi) +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: popq %r15 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: lshr_i512: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm0[3,4,5,6,7,0,1,2] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm3 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX512VL-NEXT: vpsrlq $63, %xmm4, %xmm4 -; AVX512VL-NEXT: vpaddq %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-NEXT: vpaddq %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlq $63, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512VL-NEXT: vpsrlq $63, %zmm0, %zmm2 -; AVX512VL-NEXT: 
vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] +; AVX512VL-NEXT: pushq %r15 +; AVX512VL-NEXT: pushq %r14 +; AVX512VL-NEXT: pushq %rbx +; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: andl $63, %ecx +; AVX512VL-NEXT: shrl $3, %eax +; AVX512VL-NEXT: andl $56, %eax +; AVX512VL-NEXT: movq -112(%rsp,%rax), %rdx +; AVX512VL-NEXT: movq -120(%rsp,%rax), %r9 +; AVX512VL-NEXT: movq %r9, %rsi +; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi +; AVX512VL-NEXT: movq -104(%rsp,%rax), %r8 +; AVX512VL-NEXT: shrdq %cl, %r8, %rdx +; AVX512VL-NEXT: movq -96(%rsp,%rax), %r10 +; AVX512VL-NEXT: shrdq %cl, %r10, %r8 +; AVX512VL-NEXT: movq -88(%rsp,%rax), %r11 +; AVX512VL-NEXT: shrdq %cl, %r11, %r10 +; AVX512VL-NEXT: movq -80(%rsp,%rax), %rbx +; AVX512VL-NEXT: shrdq %cl, %rbx, %r11 +; AVX512VL-NEXT: movq -72(%rsp,%rax), %r14 +; AVX512VL-NEXT: shrdq %cl, %r14, %rbx +; AVX512VL-NEXT: movq -128(%rsp,%rax), %r15 +; AVX512VL-NEXT: shrdq %cl, %r9, %r15 +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx +; AVX512VL-NEXT: movq %rcx, 56(%rdi) +; AVX512VL-NEXT: movq %rbx, 48(%rdi) +; AVX512VL-NEXT: movq %r11, 40(%rdi) +; AVX512VL-NEXT: movq %r10, 32(%rdi) +; AVX512VL-NEXT: movq %r8, 24(%rdi) +; AVX512VL-NEXT: movq %rdx, 16(%rdi) +; AVX512VL-NEXT: movq %rsi, 8(%rdi) +; AVX512VL-NEXT: movq %r15, (%rdi) +; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: popq %r14 +; AVX512VL-NEXT: popq %r15 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VBMI-LABEL: shl_i512_1: +; AVX512VBMI-LABEL: lshr_i512: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX512VBMI-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3 -; AVX512VBMI-NEXT: vpaddq %xmm0, %xmm0, %xmm4 -; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VBMI-NEXT: vpshldq $1, %ymm1, %ymm2, %ymm1 -; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; AVX512VBMI-NEXT: vpshldq $1, %zmm0, %zmm2, %zmm0 -; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] +; AVX512VBMI-NEXT: pushq %r15 +; AVX512VBMI-NEXT: pushq %r14 +; AVX512VBMI-NEXT: pushq %rbx +; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %ymm1, 
-{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movl %eax, %ecx +; AVX512VBMI-NEXT: andl $63, %ecx +; AVX512VBMI-NEXT: shrl $3, %eax +; AVX512VBMI-NEXT: andl $56, %eax +; AVX512VBMI-NEXT: movq -112(%rsp,%rax), %rdx +; AVX512VBMI-NEXT: movq -120(%rsp,%rax), %r9 +; AVX512VBMI-NEXT: movq %r9, %rsi +; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi +; AVX512VBMI-NEXT: movq -104(%rsp,%rax), %r8 +; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx +; AVX512VBMI-NEXT: movq -96(%rsp,%rax), %r10 +; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8 +; AVX512VBMI-NEXT: movq -88(%rsp,%rax), %r11 +; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10 +; AVX512VBMI-NEXT: movq -80(%rsp,%rax), %rbx +; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11 +; AVX512VBMI-NEXT: movq -72(%rsp,%rax), %r14 +; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx +; AVX512VBMI-NEXT: movq -128(%rsp,%rax), %r15 +; AVX512VBMI-NEXT: shrdq %cl, %r9, %r15 +; AVX512VBMI-NEXT: movq %rdi, %rax +; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx +; AVX512VBMI-NEXT: movq %rcx, 56(%rdi) +; AVX512VBMI-NEXT: movq %rbx, 48(%rdi) +; AVX512VBMI-NEXT: movq %r11, 40(%rdi) +; AVX512VBMI-NEXT: movq %r10, 32(%rdi) +; AVX512VBMI-NEXT: movq %r8, 24(%rdi) +; AVX512VBMI-NEXT: movq %rdx, 16(%rdi) +; AVX512VBMI-NEXT: movq %rsi, 8(%rdi) +; AVX512VBMI-NEXT: movq %r15, (%rdi) +; AVX512VBMI-NEXT: popq %rbx +; AVX512VBMI-NEXT: popq %r14 +; AVX512VBMI-NEXT: popq %r15 +; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq + %r = lshr i512 %a0, %a1 + ret i512 %r +} + +define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind { +; SSE-LABEL: ashr_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; SSE-NEXT: sarq $63, %r10 +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: andl $56, %eax +; SSE-NEXT: movq -112(%rsp,%rax), %rdx +; SSE-NEXT: movq -120(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %rsi +; SSE-NEXT: shrdq %cl, %rdx, %rsi +; SSE-NEXT: movq -104(%rsp,%rax), %r8 +; SSE-NEXT: shrdq %cl, %r8, %rdx +; SSE-NEXT: movq -96(%rsp,%rax), %r10 +; SSE-NEXT: shrdq %cl, %r10, %r8 +; SSE-NEXT: movq -88(%rsp,%rax), %r11 +; SSE-NEXT: shrdq %cl, %r11, %r10 +; SSE-NEXT: movq -80(%rsp,%rax), %rbx +; SSE-NEXT: shrdq %cl, %rbx, %r11 +; SSE-NEXT: movq -72(%rsp,%rax), %r14 +; SSE-NEXT: shrdq %cl, %r14, %rbx +; SSE-NEXT: movq -128(%rsp,%rax), %r15 +; SSE-NEXT: shrdq %cl, %r9, %r15 +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: # kill: def $cl killed $cl 
killed $ecx +; SSE-NEXT: sarq %cl, %r14 +; SSE-NEXT: movq %r14, 56(%rdi) +; SSE-NEXT: movq %rbx, 48(%rdi) +; SSE-NEXT: movq %r11, 40(%rdi) +; SSE-NEXT: movq %r10, 32(%rdi) +; SSE-NEXT: movq %r8, 24(%rdi) +; SSE-NEXT: movq %rdx, 16(%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: movq %r15, (%rdi) +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX2-LABEL: ashr_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: andl $56, %eax +; AVX2-NEXT: movq -112(%rsp,%rax), %rdx +; AVX2-NEXT: movq -120(%rsp,%rax), %r9 +; AVX2-NEXT: movq %r9, %rsi +; AVX2-NEXT: shrdq %cl, %rdx, %rsi +; AVX2-NEXT: movq -104(%rsp,%rax), %r8 +; AVX2-NEXT: shrdq %cl, %r8, %rdx +; AVX2-NEXT: movq -96(%rsp,%rax), %r10 +; AVX2-NEXT: shrdq %cl, %r10, %r8 +; AVX2-NEXT: movq -88(%rsp,%rax), %r11 +; AVX2-NEXT: shrdq %cl, %r11, %r10 +; AVX2-NEXT: movq -80(%rsp,%rax), %rbx +; AVX2-NEXT: shrdq %cl, %rbx, %r11 +; AVX2-NEXT: movq -128(%rsp,%rax), %r14 +; AVX2-NEXT: movq -72(%rsp,%rax), %r15 +; AVX2-NEXT: shrdq %cl, %r15, %rbx +; AVX2-NEXT: shrdq %cl, %r9, %r14 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: sarxq %rcx, %r15, %rcx +; AVX2-NEXT: movq %rcx, 56(%rdi) +; AVX2-NEXT: movq %rbx, 48(%rdi) +; AVX2-NEXT: movq %r11, 40(%rdi) +; AVX2-NEXT: movq %r10, 32(%rdi) +; AVX2-NEXT: movq %r8, 24(%rdi) +; AVX2-NEXT: movq %rdx, 16(%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: movq %r14, (%rdi) +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: ashr_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r15 +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: sarq $63, %r10 +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: shrl $3, %eax 
+; AVX512F-NEXT: andl $56, %eax +; AVX512F-NEXT: movq -112(%rsp,%rax), %rdx +; AVX512F-NEXT: movq -120(%rsp,%rax), %r9 +; AVX512F-NEXT: movq %r9, %rsi +; AVX512F-NEXT: shrdq %cl, %rdx, %rsi +; AVX512F-NEXT: movq -104(%rsp,%rax), %r8 +; AVX512F-NEXT: shrdq %cl, %r8, %rdx +; AVX512F-NEXT: movq -96(%rsp,%rax), %r10 +; AVX512F-NEXT: shrdq %cl, %r10, %r8 +; AVX512F-NEXT: movq -88(%rsp,%rax), %r11 +; AVX512F-NEXT: shrdq %cl, %r11, %r10 +; AVX512F-NEXT: movq -80(%rsp,%rax), %rbx +; AVX512F-NEXT: shrdq %cl, %rbx, %r11 +; AVX512F-NEXT: movq -128(%rsp,%rax), %r14 +; AVX512F-NEXT: movq -72(%rsp,%rax), %r15 +; AVX512F-NEXT: shrdq %cl, %r15, %rbx +; AVX512F-NEXT: shrdq %cl, %r9, %r14 +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: sarxq %rcx, %r15, %rcx +; AVX512F-NEXT: movq %rcx, 56(%rdi) +; AVX512F-NEXT: movq %rbx, 48(%rdi) +; AVX512F-NEXT: movq %r11, 40(%rdi) +; AVX512F-NEXT: movq %r10, 32(%rdi) +; AVX512F-NEXT: movq %r8, 24(%rdi) +; AVX512F-NEXT: movq %rdx, 16(%rdi) +; AVX512F-NEXT: movq %rsi, 8(%rdi) +; AVX512F-NEXT: movq %r14, (%rdi) +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: popq %r15 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: ashr_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: pushq %r15 +; AVX512VL-NEXT: pushq %r14 +; AVX512VL-NEXT: pushq %rbx +; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: sarq $63, %r10 +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: andl $63, %ecx +; AVX512VL-NEXT: shrl $3, %eax +; AVX512VL-NEXT: andl $56, %eax +; AVX512VL-NEXT: movq -112(%rsp,%rax), %rdx +; AVX512VL-NEXT: movq -120(%rsp,%rax), %r9 +; AVX512VL-NEXT: movq %r9, %rsi +; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi +; AVX512VL-NEXT: movq -104(%rsp,%rax), %r8 +; AVX512VL-NEXT: shrdq %cl, %r8, %rdx +; AVX512VL-NEXT: movq -96(%rsp,%rax), %r10 +; AVX512VL-NEXT: shrdq %cl, %r10, %r8 +; AVX512VL-NEXT: movq -88(%rsp,%rax), %r11 +; AVX512VL-NEXT: shrdq %cl, %r11, %r10 +; AVX512VL-NEXT: movq -80(%rsp,%rax), %rbx +; AVX512VL-NEXT: shrdq %cl, %rbx, %r11 +; AVX512VL-NEXT: movq -72(%rsp,%rax), %r14 +; AVX512VL-NEXT: shrdq %cl, %r14, %rbx +; AVX512VL-NEXT: movq -128(%rsp,%rax), %r15 +; AVX512VL-NEXT: shrdq %cl, %r9, %r15 +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx +; AVX512VL-NEXT: movq %rcx, 56(%rdi) +; AVX512VL-NEXT: movq %rbx, 48(%rdi) +; AVX512VL-NEXT: movq %r11, 40(%rdi) +; AVX512VL-NEXT: movq %r10, 32(%rdi) +; AVX512VL-NEXT: movq %r8, 24(%rdi) +; AVX512VL-NEXT: movq %rdx, 16(%rdi) +; AVX512VL-NEXT: movq %rsi, 8(%rdi) +; AVX512VL-NEXT: movq %r15, (%rdi) +; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: popq %r14 +; AVX512VL-NEXT: popq %r15 +; AVX512VL-NEXT: retq ; -; ZNVER4-LABEL: shl_i512_1: -; ZNVER4: # %bb.0: -; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; 
ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm2 -; ZNVER4-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; ZNVER4-NEXT: vpaddq %xmm0, %xmm0, %xmm4 -; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; ZNVER4-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3 -; ZNVER4-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; ZNVER4-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; ZNVER4-NEXT: vpshldq $1, %ymm1, %ymm2, %ymm1 -; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; ZNVER4-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; ZNVER4-NEXT: vpshldq $1, %zmm0, %zmm3, %zmm0 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] -; ZNVER4-NEXT: retq - %d = bitcast <8 x i64> %a to i512 - %s = shl i512 %d, 1 - %r = bitcast i512 %s to <8 x i64> - ret <8 x i64> %r +; AVX512VBMI-LABEL: ashr_i512: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: pushq %r15 +; AVX512VBMI-NEXT: pushq %r14 +; AVX512VBMI-NEXT: pushq %rbx +; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: sarq $63, %r10 +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movl %eax, %ecx +; AVX512VBMI-NEXT: andl $63, %ecx +; AVX512VBMI-NEXT: shrl $3, %eax +; AVX512VBMI-NEXT: andl $56, %eax +; AVX512VBMI-NEXT: movq -112(%rsp,%rax), %rdx +; AVX512VBMI-NEXT: movq -120(%rsp,%rax), %r9 +; AVX512VBMI-NEXT: movq %r9, %rsi +; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi +; AVX512VBMI-NEXT: movq -104(%rsp,%rax), %r8 +; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx +; AVX512VBMI-NEXT: movq -96(%rsp,%rax), %r10 +; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8 +; AVX512VBMI-NEXT: movq -88(%rsp,%rax), %r11 +; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10 +; AVX512VBMI-NEXT: movq -80(%rsp,%rax), %rbx +; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11 +; AVX512VBMI-NEXT: movq -72(%rsp,%rax), %r14 +; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx +; AVX512VBMI-NEXT: movq -128(%rsp,%rax), %r15 +; AVX512VBMI-NEXT: shrdq %cl, %r9, %r15 +; AVX512VBMI-NEXT: movq %rdi, %rax +; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx +; AVX512VBMI-NEXT: movq %rcx, 56(%rdi) +; AVX512VBMI-NEXT: movq %rbx, 48(%rdi) +; AVX512VBMI-NEXT: movq %r11, 40(%rdi) +; AVX512VBMI-NEXT: movq %r10, 32(%rdi) +; AVX512VBMI-NEXT: movq %r8, 24(%rdi) +; AVX512VBMI-NEXT: movq %rdx, 16(%rdi) +; AVX512VBMI-NEXT: movq %rsi, 8(%rdi) +; AVX512VBMI-NEXT: movq %r15, (%rdi) +; AVX512VBMI-NEXT: popq %rbx +; AVX512VBMI-NEXT: popq %r14 +; AVX512VBMI-NEXT: popq %r15 +; AVX512VBMI-NEXT: retq + %r = ashr i512 %a0, %a1 + ret i512 %r +} + +define i512 @shl_i512_1(i512 %a0) nounwind { +; CHECK-LABEL: shl_i512_1: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; 
CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; CHECK-NEXT: shldq $1, %rdi, %r10 +; CHECK-NEXT: shldq $1, %r11, %rdi +; CHECK-NEXT: shldq $1, %r9, %r11 +; CHECK-NEXT: shldq $1, %r8, %r9 +; CHECK-NEXT: shldq $1, %rcx, %r8 +; CHECK-NEXT: shldq $1, %rdx, %rcx +; CHECK-NEXT: shldq $1, %rsi, %rdx +; CHECK-NEXT: addq %rsi, %rsi +; CHECK-NEXT: movq %r10, 56(%rax) +; CHECK-NEXT: movq %rdi, 48(%rax) +; CHECK-NEXT: movq %r11, 40(%rax) +; CHECK-NEXT: movq %r9, 32(%rax) +; CHECK-NEXT: movq %r8, 24(%rax) +; CHECK-NEXT: movq %rcx, 16(%rax) +; CHECK-NEXT: movq %rdx, 8(%rax) +; CHECK-NEXT: movq %rsi, (%rax) +; CHECK-NEXT: retq + %r = shl i512 %a0, 1 + ret i512 %r +} + +define i512 @lshr_i512_1(i512 %a0) nounwind { +; CHECK-LABEL: lshr_i512_1: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; CHECK-NEXT: shrdq $1, %rdx, %rsi +; CHECK-NEXT: shrdq $1, %rcx, %rdx +; CHECK-NEXT: shrdq $1, %r8, %rcx +; CHECK-NEXT: shrdq $1, %r9, %r8 +; CHECK-NEXT: shrdq $1, %r11, %r9 +; CHECK-NEXT: shrdq $1, %rdi, %r11 +; CHECK-NEXT: shrdq $1, %r10, %rdi +; CHECK-NEXT: shrq %r10 +; CHECK-NEXT: movq %r10, 56(%rax) +; CHECK-NEXT: movq %rdi, 48(%rax) +; CHECK-NEXT: movq %r11, 40(%rax) +; CHECK-NEXT: movq %r9, 32(%rax) +; CHECK-NEXT: movq %r8, 24(%rax) +; CHECK-NEXT: movq %rcx, 16(%rax) +; CHECK-NEXT: movq %rdx, 8(%rax) +; CHECK-NEXT: movq %rsi, (%rax) +; CHECK-NEXT: retq + %r = lshr i512 %a0, 1 + ret i512 %r +} + +define i512 @ashr_i512_1(i512 %a0) nounwind { +; CHECK-LABEL: ashr_i512_1: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; CHECK-NEXT: shrdq $1, %rdx, %rsi +; CHECK-NEXT: shrdq $1, %rcx, %rdx +; CHECK-NEXT: shrdq $1, %r8, %rcx +; CHECK-NEXT: shrdq $1, %r9, %r8 +; CHECK-NEXT: shrdq $1, %r11, %r9 +; CHECK-NEXT: shrdq $1, %rdi, %r11 +; CHECK-NEXT: shrdq $1, %r10, %rdi +; CHECK-NEXT: sarq %r10 +; CHECK-NEXT: movq %r10, 56(%rax) +; CHECK-NEXT: movq %rdi, 48(%rax) +; CHECK-NEXT: movq %r11, 40(%rax) +; CHECK-NEXT: movq %r9, 32(%rax) +; CHECK-NEXT: movq %r8, 24(%rax) +; CHECK-NEXT: movq %rcx, 16(%rax) +; CHECK-NEXT: movq %rdx, 8(%rax) +; CHECK-NEXT: movq %rsi, (%rax) +; CHECK-NEXT: retq + %r = ashr i512 %a0, 1 + ret i512 %r +} + +define i512 @shl_i512_200(i512 %a0) nounwind { +; SSE-LABEL: shl_i512_200: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: shldq $8, %r8, %r9 +; SSE-NEXT: shldq $8, %rcx, %r8 +; SSE-NEXT: shldq $8, %rdx, %rcx +; SSE-NEXT: shldq $8, %rsi, %rdx +; SSE-NEXT: shlq $8, %rsi +; SSE-NEXT: movq %r9, 56(%rdi) +; SSE-NEXT: movq %r8, 48(%rdi) +; SSE-NEXT: movq %rcx, 40(%rdi) +; SSE-NEXT: movq %rdx, 32(%rdi) +; SSE-NEXT: movq %rsi, 24(%rdi) +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm0, (%rdi) +; SSE-NEXT: movq $0, 16(%rdi) +; SSE-NEXT: retq +; +; AVX2-LABEL: shl_i512_200: +; AVX2: # %bb.0: +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shldq $8, %r8, %r9 +; AVX2-NEXT: shldq $8, %rcx, %r8 +; AVX2-NEXT: shldq $8, %rdx, %rcx +; AVX2-NEXT: shldq $8, %rsi, %rdx +; AVX2-NEXT: shlq $8, %rsi +; AVX2-NEXT: movq %r9, 56(%rdi) +; AVX2-NEXT: movq %r8, 48(%rdi) +; AVX2-NEXT: movq %rcx, 40(%rdi) +; AVX2-NEXT: movq %rdx, 32(%rdi) +; AVX2-NEXT: movq %rsi, 24(%rdi) +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %xmm0, (%rdi) +; AVX2-NEXT: movq $0, 16(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: shl_i512_200: +; AVX512: # %bb.0: +; 
AVX512-NEXT: movq %rdi, %rax +; AVX512-NEXT: shldq $8, %r8, %r9 +; AVX512-NEXT: shldq $8, %rcx, %r8 +; AVX512-NEXT: shldq $8, %rdx, %rcx +; AVX512-NEXT: shldq $8, %rsi, %rdx +; AVX512-NEXT: shlq $8, %rsi +; AVX512-NEXT: movq %r9, 56(%rdi) +; AVX512-NEXT: movq %r8, 48(%rdi) +; AVX512-NEXT: movq %rcx, 40(%rdi) +; AVX512-NEXT: movq %rdx, 32(%rdi) +; AVX512-NEXT: movq %rsi, 24(%rdi) +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rdi) +; AVX512-NEXT: movq $0, 16(%rdi) +; AVX512-NEXT: retq + %r = shl i512 %a0, 200 + ret i512 %r +} + +define i512 @lshr_i512_200(i512 %a0) nounwind { +; SSE-LABEL: lshr_i512_200: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: shrdq $8, %r9, %r8 +; SSE-NEXT: shrdq $8, %rsi, %r9 +; SSE-NEXT: shrdq $8, %rcx, %rsi +; SSE-NEXT: shrdq $8, %rdx, %rcx +; SSE-NEXT: shrq $8, %rdx +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, 40(%rdi) +; SSE-NEXT: movq %rdx, 32(%rdi) +; SSE-NEXT: movq %rcx, 24(%rdi) +; SSE-NEXT: movq %rsi, 16(%rdi) +; SSE-NEXT: movq %r9, 8(%rdi) +; SSE-NEXT: movq %r8, (%rdi) +; SSE-NEXT: movq $0, 56(%rdi) +; SSE-NEXT: retq +; +; AVX2-LABEL: lshr_i512_200: +; AVX2: # %bb.0: +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX2-NEXT: shrdq $8, %r9, %r8 +; AVX2-NEXT: shrdq $8, %rsi, %r9 +; AVX2-NEXT: shrdq $8, %rcx, %rsi +; AVX2-NEXT: shrdq $8, %rdx, %rcx +; AVX2-NEXT: shrq $8, %rdx +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %xmm0, 40(%rdi) +; AVX2-NEXT: movq %rdx, 32(%rdi) +; AVX2-NEXT: movq %rcx, 24(%rdi) +; AVX2-NEXT: movq %rsi, 16(%rdi) +; AVX2-NEXT: movq %r9, 8(%rdi) +; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: movq $0, 56(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: lshr_i512_200: +; AVX512: # %bb.0: +; AVX512-NEXT: movq %rdi, %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX512-NEXT: shrdq $8, %r9, %r8 +; AVX512-NEXT: shrdq $8, %rsi, %r9 +; AVX512-NEXT: shrdq $8, %rcx, %rsi +; AVX512-NEXT: shrdq $8, %rdx, %rcx +; AVX512-NEXT: shrq $8, %rdx +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %xmm0, 40(%rdi) +; AVX512-NEXT: movq %rdx, 32(%rdi) +; AVX512-NEXT: movq %rcx, 24(%rdi) +; AVX512-NEXT: movq %rsi, 16(%rdi) +; AVX512-NEXT: movq %r9, 8(%rdi) +; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq $0, 56(%rdi) +; AVX512-NEXT: retq + %r = lshr i512 %a0, 200 + ret i512 %r +} + +define i512 @ashr_i512_200(i512 %a0) nounwind { +; CHECK-LABEL: ashr_i512_200: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: shrdq $8, %r9, %r8 +; CHECK-NEXT: shrdq $8, %rsi, %r9 +; CHECK-NEXT: shrdq $8, %rcx, %rsi +; CHECK-NEXT: shrdq $8, %rdx, %rcx +; CHECK-NEXT: movq %rdx, %rdi +; CHECK-NEXT: sarq $8, %rdi +; CHECK-NEXT: sarq $63, %rdx +; CHECK-NEXT: movq %rdx, 56(%rax) +; CHECK-NEXT: movq %rdx, 48(%rax) +; CHECK-NEXT: movq %rdx, 40(%rax) +; CHECK-NEXT: movq %rdi, 32(%rax) +; CHECK-NEXT: movq %rcx, 24(%rax) +; CHECK-NEXT: movq %rsi, 16(%rax) +; CHECK-NEXT: movq %r9, 8(%rax) +; CHECK-NEXT: movq %r8, (%rax) +; CHECK-NEXT: retq + %r = ashr i512 %a0, 200 + ret i512 %r } -define <8 x i64> @lshr_i512_1(<8 x i64> %a) { -; 
AVX512VL-LABEL: lshr_i512_1: +define i512 @shl_i512_511(i512 %a0) nounwind { +; SSE-LABEL: shl_i512_511: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: shlq $63, %rsi +; SSE-NEXT: movq %rsi, 56(%rdi) +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm0, 32(%rdi) +; SSE-NEXT: movaps %xmm0, 16(%rdi) +; SSE-NEXT: movaps %xmm0, (%rdi) +; SSE-NEXT: movq $0, 48(%rdi) +; SSE-NEXT: retq +; +; AVX2-LABEL: shl_i512_511: +; AVX2: # %bb.0: +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shlq $63, %rsi +; AVX2-NEXT: movq %rsi, 56(%rdi) +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %xmm0, 32(%rdi) +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, (%rdi) +; AVX2-NEXT: movq $0, 48(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: shl_i512_511: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: shlq $63, %rsi +; AVX512F-NEXT: movq %rsi, 56(%rdi) +; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovaps %xmm0, 32(%rdi) +; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovups %ymm0, (%rdi) +; AVX512F-NEXT: movq $0, 48(%rdi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shl_i512_511: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: shlq $63, %rsi +; AVX512VL-NEXT: movq %rsi, 56(%rdi) +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, 32(%rdi) +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovups %ymm0, (%rdi) +; AVX512VL-NEXT: movq $0, 48(%rdi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512VBMI-LABEL: shl_i512_511: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: movq %rdi, %rax +; AVX512VBMI-NEXT: shlq $63, %rsi +; AVX512VBMI-NEXT: movq %rsi, 56(%rdi) +; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovaps %xmm0, 32(%rdi) +; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovups %ymm0, (%rdi) +; AVX512VBMI-NEXT: movq $0, 48(%rdi) +; AVX512VBMI-NEXT: vzeroupper +; AVX512VBMI-NEXT: retq + %r = shl i512 %a0, 511 + ret i512 %r +} + +define i512 @lshr_i512_511(i512 %a0) nounwind { +; SSE-LABEL: lshr_i512_511: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: shrq $63, %rcx +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, 40(%rdi) +; SSE-NEXT: movups %xmm0, 24(%rdi) +; SSE-NEXT: movups %xmm0, 8(%rdi) +; SSE-NEXT: movq %rcx, (%rdi) +; SSE-NEXT: movq $0, 56(%rdi) +; SSE-NEXT: retq +; +; AVX2-LABEL: lshr_i512_511: +; AVX2: # %bb.0: +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: shrq $63, %rcx +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %xmm0, 40(%rdi) +; AVX2-NEXT: movq %rcx, (%rdi) +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, 8(%rdi) +; AVX2-NEXT: movq $0, 56(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: lshr_i512_511: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512F-NEXT: shrq $63, %rcx +; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovups %xmm0, 40(%rdi) +; AVX512F-NEXT: movq %rcx, (%rdi) +; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovups %ymm0, 8(%rdi) +; AVX512F-NEXT: movq $0, 56(%rdi) +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: lshr_i512_511: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512VL-NEXT: 
vpsllq $63, %xmm3, %xmm4 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX512VL-NEXT: vpsrlq $1, %xmm5, %xmm5 -; AVX512VL-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX512VL-NEXT: vpsrlq $1, %xmm3, %xmm3 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsllq $63, %ymm1, %ymm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7] -; AVX512VL-NEXT: vpsrlq $1, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512VL-NEXT: vpsrlq $1, %zmm0, %zmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovups %xmm0, 40(%rdi) +; AVX512VL-NEXT: shrq $63, %rcx +; AVX512VL-NEXT: movq %rcx, (%rdi) +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovups %ymm0, 8(%rdi) +; AVX512VL-NEXT: movq $0, 56(%rdi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VBMI-LABEL: lshr_i512_1: +; AVX512VBMI-LABEL: lshr_i512_511: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; AVX512VBMI-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX512VBMI-NEXT: vpsrlq $1, %xmm2, %xmm2 -; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7] -; AVX512VBMI-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1 -; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; AVX512VBMI-NEXT: vpshldq $63, %zmm0, %zmm2, %zmm0 -; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512VBMI-NEXT: movq %rdi, %rax +; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovups %xmm0, 40(%rdi) +; AVX512VBMI-NEXT: shrq $63, %rcx +; AVX512VBMI-NEXT: movq %rcx, (%rdi) +; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovups %ymm0, 8(%rdi) +; AVX512VBMI-NEXT: movq $0, 56(%rdi) +; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq + %r = lshr i512 %a0, 511 + ret i512 %r +} + +define i512 @ashr_i512_511(i512 %a0) nounwind { +; CHECK-LABEL: ashr_i512_511: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: sarq $63, %rcx +; CHECK-NEXT: movq %rcx, 56(%rdi) +; CHECK-NEXT: movq %rcx, 48(%rdi) +; CHECK-NEXT: movq %rcx, 40(%rdi) +; CHECK-NEXT: movq %rcx, 32(%rdi) +; CHECK-NEXT: movq %rcx, 24(%rdi) +; CHECK-NEXT: movq %rcx, 16(%rdi) +; CHECK-NEXT: movq %rcx, 8(%rdi) +; CHECK-NEXT: movq %rcx, (%rdi) +; CHECK-NEXT: retq + %r = ashr i512 %a0, 511 + ret i512 %r +} + +define i512 @shl_1_i512(i512 %a0) nounwind { +; SSE-LABEL: shl_1_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: pushq %rax +; SSE-NEXT: 
xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi +; SSE-NEXT: negl %esi +; SSE-NEXT: movslq %esi, %rax +; SSE-NEXT: movq -56(%rsp,%rax), %rdx +; SSE-NEXT: movq -48(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %rsi +; SSE-NEXT: shldq %cl, %rdx, %rsi +; SSE-NEXT: movq -40(%rsp,%rax), %r10 +; SSE-NEXT: movq %r10, %r8 +; SSE-NEXT: shldq %cl, %r9, %r8 +; SSE-NEXT: movq -32(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %r11 +; SSE-NEXT: shldq %cl, %r10, %r11 +; SSE-NEXT: movq -24(%rsp,%rax), %r10 +; SSE-NEXT: movq %r10, %rbx +; SSE-NEXT: shldq %cl, %r9, %rbx +; SSE-NEXT: movq -16(%rsp,%rax), %r9 +; SSE-NEXT: movq %r9, %r14 +; SSE-NEXT: shldq %cl, %r10, %r14 +; SSE-NEXT: movq -8(%rsp,%rax), %r10 +; SSE-NEXT: shldq %cl, %r9, %r10 +; SSE-NEXT: movq -64(%rsp,%rax), %rax +; SSE-NEXT: movq %rax, %r9 +; SSE-NEXT: shlq %cl, %r9 +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movq %r10, 56(%rdi) +; SSE-NEXT: movq %r14, 48(%rdi) +; SSE-NEXT: movq %rbx, 40(%rdi) +; SSE-NEXT: movq %r11, 32(%rdi) +; SSE-NEXT: movq %r8, 24(%rdi) +; SSE-NEXT: movq %rsi, 16(%rdi) +; SSE-NEXT: movq %rdx, 8(%rdi) +; SSE-NEXT: movq %r9, (%rdi) +; SSE-NEXT: addq $8, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: retq +; +; AVX2-LABEL: shl_1_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: negl %esi +; AVX2-NEXT: movslq %esi, %r8 +; AVX2-NEXT: movq -56(%rsp,%r8), %rdx +; AVX2-NEXT: movq -48(%rsp,%r8), %rax +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: shldq %cl, %rdx, %rsi +; AVX2-NEXT: movq -40(%rsp,%r8), %r10 +; AVX2-NEXT: movq %r10, %r9 +; AVX2-NEXT: shldq %cl, %rax, %r9 +; AVX2-NEXT: movq -32(%rsp,%r8), %rax +; AVX2-NEXT: movq %rax, %r11 +; AVX2-NEXT: shldq %cl, %r10, %r11 +; AVX2-NEXT: movq -24(%rsp,%r8), %r10 +; AVX2-NEXT: movq %r10, %rbx +; AVX2-NEXT: shldq %cl, %rax, %rbx +; AVX2-NEXT: movq -16(%rsp,%r8), %rax +; AVX2-NEXT: movq %rax, %r14 +; AVX2-NEXT: shldq %cl, %r10, %r14 +; AVX2-NEXT: movq -8(%rsp,%r8), %r10 +; AVX2-NEXT: shldq %cl, %rax, %r10 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: movq -64(%rsp,%r8), %rdi +; AVX2-NEXT: shlxq %rcx, %rdi, %r8 +; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX2-NEXT: shldq %cl, %rdi, %rdx +; AVX2-NEXT: movq %r10, 56(%rax) +; AVX2-NEXT: movq %r14, 48(%rax) +; AVX2-NEXT: movq %rbx, 40(%rax) +; AVX2-NEXT: movq %r11, 32(%rax) +; AVX2-NEXT: movq %r9, 24(%rax) +; AVX2-NEXT: movq %rsi, 16(%rax) +; AVX2-NEXT: movq %rdx, 8(%rax) +; AVX2-NEXT: movq %r8, (%rax) +; AVX2-NEXT: addq $8, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; -; 
ZNVER4-LABEL: lshr_i512_1: -; ZNVER4: # %bb.0: -; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1 -; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; ZNVER4-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7] -; ZNVER4-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4 -; ZNVER4-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; ZNVER4-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1 -; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; ZNVER4-NEXT: vpsrlq $1, %xmm2, %xmm2 -; ZNVER4-NEXT: vpshldq $63, %zmm0, %zmm3, %zmm0 -; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; ZNVER4-NEXT: retq - %d = bitcast <8 x i64> %a to i512 - %s = lshr i512 %d, 1 - %r = bitcast i512 %s to <8 x i64> - ret <8 x i64> %r +; AVX512F-LABEL: shl_1_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: pushq %rax +; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl %esi, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: shrl $3, %esi +; AVX512F-NEXT: andl $56, %esi +; AVX512F-NEXT: negl %esi +; AVX512F-NEXT: movslq %esi, %r8 +; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx +; AVX512F-NEXT: movq -48(%rsp,%r8), %rax +; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: shldq %cl, %rdx, %rsi +; AVX512F-NEXT: movq -40(%rsp,%r8), %r10 +; AVX512F-NEXT: movq %r10, %r9 +; AVX512F-NEXT: shldq %cl, %rax, %r9 +; AVX512F-NEXT: movq -32(%rsp,%r8), %rax +; AVX512F-NEXT: movq %rax, %r11 +; AVX512F-NEXT: shldq %cl, %r10, %r11 +; AVX512F-NEXT: movq -24(%rsp,%r8), %r10 +; AVX512F-NEXT: movq %r10, %rbx +; AVX512F-NEXT: shldq %cl, %rax, %rbx +; AVX512F-NEXT: movq -16(%rsp,%r8), %rax +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: shldq %cl, %r10, %r14 +; AVX512F-NEXT: movq -8(%rsp,%r8), %r10 +; AVX512F-NEXT: shldq %cl, %rax, %r10 +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi +; AVX512F-NEXT: shlxq %rcx, %rdi, %r8 +; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512F-NEXT: shldq %cl, %rdi, %rdx +; AVX512F-NEXT: movq %r10, 56(%rax) +; AVX512F-NEXT: movq %r14, 48(%rax) +; AVX512F-NEXT: movq %rbx, 40(%rax) +; AVX512F-NEXT: movq %r11, 32(%rax) +; AVX512F-NEXT: movq %r9, 24(%rax) +; AVX512F-NEXT: movq %rsi, 16(%rax) +; AVX512F-NEXT: movq %rdx, 8(%rax) +; AVX512F-NEXT: movq %r8, (%rax) +; AVX512F-NEXT: addq $8, %rsp +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shl_1_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: pushq %r15 +; AVX512VL-NEXT: pushq %r14 +; AVX512VL-NEXT: pushq %rbx +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] +; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl %esi, %ecx +; AVX512VL-NEXT: andl $63, %ecx +; AVX512VL-NEXT: shrl $3, %esi +; AVX512VL-NEXT: andl $56, %esi +; AVX512VL-NEXT: negl %esi +; AVX512VL-NEXT: movslq %esi, %r9 +; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx +; AVX512VL-NEXT: movq 
-48(%rsp,%r9), %rax +; AVX512VL-NEXT: movq %rax, %rsi +; AVX512VL-NEXT: shldq %cl, %rdx, %rsi +; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10 +; AVX512VL-NEXT: movq %r10, %r8 +; AVX512VL-NEXT: shldq %cl, %rax, %r8 +; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11 +; AVX512VL-NEXT: movq %r11, %rbx +; AVX512VL-NEXT: shldq %cl, %r10, %rbx +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi +; AVX512VL-NEXT: movq %rdi, %r10 +; AVX512VL-NEXT: shldq %cl, %r11, %r10 +; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11 +; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14 +; AVX512VL-NEXT: movq %r14, %r15 +; AVX512VL-NEXT: shldq %cl, %rdi, %r15 +; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi +; AVX512VL-NEXT: shldq %cl, %r14, %rdi +; AVX512VL-NEXT: shlxq %rcx, %r11, %r9 +; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512VL-NEXT: shldq %cl, %r11, %rdx +; AVX512VL-NEXT: movq %rdi, 56(%rax) +; AVX512VL-NEXT: movq %r15, 48(%rax) +; AVX512VL-NEXT: movq %r10, 40(%rax) +; AVX512VL-NEXT: movq %rbx, 32(%rax) +; AVX512VL-NEXT: movq %r8, 24(%rax) +; AVX512VL-NEXT: movq %rsi, 16(%rax) +; AVX512VL-NEXT: movq %rdx, 8(%rax) +; AVX512VL-NEXT: movq %r9, (%rax) +; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: popq %r14 +; AVX512VL-NEXT: popq %r15 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512VBMI-LABEL: shl_1_i512: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: pushq %r15 +; AVX512VBMI-NEXT: pushq %r14 +; AVX512VBMI-NEXT: pushq %rbx +; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] +; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movl %esi, %ecx +; AVX512VBMI-NEXT: andl $63, %ecx +; AVX512VBMI-NEXT: shrl $3, %esi +; AVX512VBMI-NEXT: andl $56, %esi +; AVX512VBMI-NEXT: negl %esi +; AVX512VBMI-NEXT: movslq %esi, %r9 +; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx +; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax +; AVX512VBMI-NEXT: movq %rax, %rsi +; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi +; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10 +; AVX512VBMI-NEXT: movq %r10, %r8 +; AVX512VBMI-NEXT: shldq %cl, %rax, %r8 +; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11 +; AVX512VBMI-NEXT: movq %r11, %rbx +; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx +; AVX512VBMI-NEXT: movq %rdi, %rax +; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi +; AVX512VBMI-NEXT: movq %rdi, %r10 +; AVX512VBMI-NEXT: shldq %cl, %r11, %r10 +; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11 +; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14 +; AVX512VBMI-NEXT: movq %r14, %r15 +; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15 +; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi +; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi +; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9 +; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx +; AVX512VBMI-NEXT: shldq %cl, %r11, %rdx +; AVX512VBMI-NEXT: movq %rdi, 56(%rax) +; AVX512VBMI-NEXT: movq %r15, 48(%rax) +; AVX512VBMI-NEXT: movq %r10, 40(%rax) +; AVX512VBMI-NEXT: movq %rbx, 32(%rax) +; AVX512VBMI-NEXT: movq %r8, 24(%rax) +; AVX512VBMI-NEXT: movq %rsi, 16(%rax) +; AVX512VBMI-NEXT: movq %rdx, 8(%rax) +; AVX512VBMI-NEXT: movq %r9, (%rax) +; AVX512VBMI-NEXT: popq %rbx +; AVX512VBMI-NEXT: popq %r14 +; AVX512VBMI-NEXT: popq %r15 +; AVX512VBMI-NEXT: vzeroupper +; AVX512VBMI-NEXT: retq + %r = shl i512 1, %a0 + ret i512 %r } -define <8 x i64> @ashr_i512_1(<8 x i64> %a) { -; AVX512VL-LABEL: ashr_i512_1: +define i512 @lshr_signbit_i512(i512 %a0) 
nounwind { +; SSE-LABEL: lshr_signbit_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: pushq %rax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi +; SSE-NEXT: movq -112(%rsp,%rsi), %rdx +; SSE-NEXT: movq -120(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, %r8 +; SSE-NEXT: shrdq %cl, %rdx, %r8 +; SSE-NEXT: movq -104(%rsp,%rsi), %r9 +; SSE-NEXT: shrdq %cl, %r9, %rdx +; SSE-NEXT: movq -96(%rsp,%rsi), %r10 +; SSE-NEXT: shrdq %cl, %r10, %r9 +; SSE-NEXT: movq -88(%rsp,%rsi), %r11 +; SSE-NEXT: shrdq %cl, %r11, %r10 +; SSE-NEXT: movq -80(%rsp,%rsi), %rbx +; SSE-NEXT: shrdq %cl, %rbx, %r11 +; SSE-NEXT: movq -72(%rsp,%rsi), %r14 +; SSE-NEXT: shrdq %cl, %r14, %rbx +; SSE-NEXT: movq -128(%rsp,%rsi), %rsi +; SSE-NEXT: shrdq %cl, %rax, %rsi +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shrq %cl, %r14 +; SSE-NEXT: movq %r14, 56(%rdi) +; SSE-NEXT: movq %rbx, 48(%rdi) +; SSE-NEXT: movq %r11, 40(%rdi) +; SSE-NEXT: movq %r10, 32(%rdi) +; SSE-NEXT: movq %r9, 24(%rdi) +; SSE-NEXT: movq %rdx, 16(%rdi) +; SSE-NEXT: movq %r8, 8(%rdi) +; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: addq $8, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: retq +; +; AVX2-LABEL: lshr_signbit_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808] +; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: movq -112(%rsp,%rsi), %rdx +; AVX2-NEXT: movq -120(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: shrdq %cl, %rdx, %r8 +; AVX2-NEXT: movq -104(%rsp,%rsi), %r9 +; AVX2-NEXT: shrdq %cl, %r9, %rdx +; AVX2-NEXT: movq -96(%rsp,%rsi), %r10 +; AVX2-NEXT: shrdq %cl, %r10, %r9 +; AVX2-NEXT: movq -88(%rsp,%rsi), %r11 +; AVX2-NEXT: shrdq %cl, %r11, %r10 +; AVX2-NEXT: movq -80(%rsp,%rsi), %rbx +; AVX2-NEXT: shrdq %cl, %rbx, %r11 +; AVX2-NEXT: movq -128(%rsp,%rsi), %r14 +; AVX2-NEXT: movq -72(%rsp,%rsi), %rsi +; AVX2-NEXT: shrdq %cl, %rsi, %rbx +; AVX2-NEXT: shrdq %cl, %rax, %r14 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrxq %rcx, %rsi, %rcx +; AVX2-NEXT: movq %rcx, 56(%rdi) +; AVX2-NEXT: movq %rbx, 48(%rdi) +; AVX2-NEXT: movq %r11, 40(%rdi) +; AVX2-NEXT: movq %r10, 32(%rdi) +; AVX2-NEXT: movq %r9, 24(%rdi) +; AVX2-NEXT: movq %rdx, 16(%rdi) +; AVX2-NEXT: movq %r8, 8(%rdi) +; AVX2-NEXT: movq %r14, (%rdi) +; AVX2-NEXT: addq $8, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: lshr_signbit_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: pushq %rax +; AVX512F-NEXT: 
vxorps %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808] +; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl %esi, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: shrl $3, %esi +; AVX512F-NEXT: andl $56, %esi +; AVX512F-NEXT: movq -112(%rsp,%rsi), %rdx +; AVX512F-NEXT: movq -120(%rsp,%rsi), %rax +; AVX512F-NEXT: movq %rax, %r8 +; AVX512F-NEXT: shrdq %cl, %rdx, %r8 +; AVX512F-NEXT: movq -104(%rsp,%rsi), %r9 +; AVX512F-NEXT: shrdq %cl, %r9, %rdx +; AVX512F-NEXT: movq -96(%rsp,%rsi), %r10 +; AVX512F-NEXT: shrdq %cl, %r10, %r9 +; AVX512F-NEXT: movq -88(%rsp,%rsi), %r11 +; AVX512F-NEXT: shrdq %cl, %r11, %r10 +; AVX512F-NEXT: movq -80(%rsp,%rsi), %rbx +; AVX512F-NEXT: shrdq %cl, %rbx, %r11 +; AVX512F-NEXT: movq -128(%rsp,%rsi), %r14 +; AVX512F-NEXT: movq -72(%rsp,%rsi), %rsi +; AVX512F-NEXT: shrdq %cl, %rsi, %rbx +; AVX512F-NEXT: shrdq %cl, %rax, %r14 +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: shrxq %rcx, %rsi, %rcx +; AVX512F-NEXT: movq %rcx, 56(%rdi) +; AVX512F-NEXT: movq %rbx, 48(%rdi) +; AVX512F-NEXT: movq %r11, 40(%rdi) +; AVX512F-NEXT: movq %r10, 32(%rdi) +; AVX512F-NEXT: movq %r9, 24(%rdi) +; AVX512F-NEXT: movq %rdx, 16(%rdi) +; AVX512F-NEXT: movq %r8, 8(%rdi) +; AVX512F-NEXT: movq %r14, (%rdi) +; AVX512F-NEXT: addq $8, %rsp +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: lshr_signbit_i512: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; AVX512VL-NEXT: vpsllq $63, %xmm3, %xmm4 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX512VL-NEXT: vpsrlq $1, %xmm5, %xmm5 -; AVX512VL-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX512VL-NEXT: vpsraq $1, %xmm3, %xmm3 -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsllq $63, %ymm1, %ymm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7] -; AVX512VL-NEXT: vpsrlq $1, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512VL-NEXT: vpsrlq $1, %zmm0, %zmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512VL-NEXT: pushq %r14 +; AVX512VL-NEXT: pushq %rbx +; AVX512VL-NEXT: pushq %rax +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808] +; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl %esi, %ecx +; AVX512VL-NEXT: andl $63, %ecx +; AVX512VL-NEXT: shrl $3, %esi +; AVX512VL-NEXT: andl $56, %esi +; AVX512VL-NEXT: movq -112(%rsp,%rsi), %rdx +; AVX512VL-NEXT: movq -120(%rsp,%rsi), %rax +; AVX512VL-NEXT: movq %rax, %r8 +; AVX512VL-NEXT: shrdq %cl, %rdx, %r8 +; AVX512VL-NEXT: movq -104(%rsp,%rsi), %r9 +; AVX512VL-NEXT: shrdq %cl, %r9, %rdx +; AVX512VL-NEXT: movq -96(%rsp,%rsi), %r10 +; AVX512VL-NEXT: shrdq %cl, %r10, %r9 +; AVX512VL-NEXT: movq -88(%rsp,%rsi), %r11 +; AVX512VL-NEXT: 
shrdq %cl, %r11, %r10 +; AVX512VL-NEXT: movq -80(%rsp,%rsi), %rbx +; AVX512VL-NEXT: shrdq %cl, %rbx, %r11 +; AVX512VL-NEXT: movq -72(%rsp,%rsi), %r14 +; AVX512VL-NEXT: shrdq %cl, %r14, %rbx +; AVX512VL-NEXT: movq -128(%rsp,%rsi), %rsi +; AVX512VL-NEXT: shrdq %cl, %rax, %rsi +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx +; AVX512VL-NEXT: movq %rcx, 56(%rdi) +; AVX512VL-NEXT: movq %rbx, 48(%rdi) +; AVX512VL-NEXT: movq %r11, 40(%rdi) +; AVX512VL-NEXT: movq %r10, 32(%rdi) +; AVX512VL-NEXT: movq %r9, 24(%rdi) +; AVX512VL-NEXT: movq %rdx, 16(%rdi) +; AVX512VL-NEXT: movq %r8, 8(%rdi) +; AVX512VL-NEXT: movq %rsi, (%rdi) +; AVX512VL-NEXT: addq $8, %rsp +; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: popq %r14 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512VBMI-LABEL: ashr_i512_1: +; AVX512VBMI-LABEL: lshr_signbit_i512: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; AVX512VBMI-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX512VBMI-NEXT: vpsraq $1, %xmm2, %xmm2 -; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7] -; AVX512VBMI-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1 -; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; AVX512VBMI-NEXT: vpshldq $63, %zmm0, %zmm2, %zmm0 -; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; AVX512VBMI-NEXT: pushq %r14 +; AVX512VBMI-NEXT: pushq %rbx +; AVX512VBMI-NEXT: pushq %rax +; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808] +; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movl %esi, %ecx +; AVX512VBMI-NEXT: andl $63, %ecx +; AVX512VBMI-NEXT: shrl $3, %esi +; AVX512VBMI-NEXT: andl $56, %esi +; AVX512VBMI-NEXT: movq -112(%rsp,%rsi), %rdx +; AVX512VBMI-NEXT: movq -120(%rsp,%rsi), %rax +; AVX512VBMI-NEXT: movq %rax, %r8 +; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r8 +; AVX512VBMI-NEXT: movq -104(%rsp,%rsi), %r9 +; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx +; AVX512VBMI-NEXT: movq -96(%rsp,%rsi), %r10 +; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9 +; AVX512VBMI-NEXT: movq -88(%rsp,%rsi), %r11 +; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10 +; AVX512VBMI-NEXT: movq -80(%rsp,%rsi), %rbx +; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11 +; AVX512VBMI-NEXT: movq -72(%rsp,%rsi), %r14 +; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx +; AVX512VBMI-NEXT: movq -128(%rsp,%rsi), %rsi +; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi +; AVX512VBMI-NEXT: movq %rdi, %rax +; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx +; AVX512VBMI-NEXT: movq %rcx, 56(%rdi) +; AVX512VBMI-NEXT: movq %rbx, 48(%rdi) +; AVX512VBMI-NEXT: movq %r11, 40(%rdi) +; AVX512VBMI-NEXT: movq %r10, 32(%rdi) +; AVX512VBMI-NEXT: movq %r9, 24(%rdi) +; AVX512VBMI-NEXT: movq %rdx, 16(%rdi) +; AVX512VBMI-NEXT: movq %r8, 8(%rdi) +; AVX512VBMI-NEXT: movq %rsi, (%rdi) +; AVX512VBMI-NEXT: addq $8, %rsp +; AVX512VBMI-NEXT: popq %rbx +; AVX512VBMI-NEXT: popq %r14 +; 
AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq + %s = shl i512 1, 511 + %r = lshr i512 %s, %a0 + ret i512 %r +} + +define i512 @ashr_signbit_i512(i512 %a0) nounwind { +; SSE-LABEL: ashr_signbit_i512: +; SSE: # %bb.0: +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: pushq %rax +; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $56, %esi +; SSE-NEXT: movq -112(%rsp,%rsi), %rdx +; SSE-NEXT: movq -120(%rsp,%rsi), %rax +; SSE-NEXT: movq %rax, %r8 +; SSE-NEXT: shrdq %cl, %rdx, %r8 +; SSE-NEXT: movq -104(%rsp,%rsi), %r9 +; SSE-NEXT: shrdq %cl, %r9, %rdx +; SSE-NEXT: movq -96(%rsp,%rsi), %r10 +; SSE-NEXT: shrdq %cl, %r10, %r9 +; SSE-NEXT: movq -88(%rsp,%rsi), %r11 +; SSE-NEXT: shrdq %cl, %r11, %r10 +; SSE-NEXT: movq -80(%rsp,%rsi), %rbx +; SSE-NEXT: shrdq %cl, %rbx, %r11 +; SSE-NEXT: movq -72(%rsp,%rsi), %r14 +; SSE-NEXT: shrdq %cl, %r14, %rbx +; SSE-NEXT: movq -128(%rsp,%rsi), %rsi +; SSE-NEXT: shrdq %cl, %rax, %rsi +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: sarq %cl, %r14 +; SSE-NEXT: movq %r14, 56(%rdi) +; SSE-NEXT: movq %rbx, 48(%rdi) +; SSE-NEXT: movq %r11, 40(%rdi) +; SSE-NEXT: movq %r10, 32(%rdi) +; SSE-NEXT: movq %r9, 24(%rdi) +; SSE-NEXT: movq %rdx, 16(%rdi) +; SSE-NEXT: movq %r8, 8(%rdi) +; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: addq $8, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: retq +; +; AVX2-LABEL: ashr_signbit_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808] +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: andl $63, %ecx +; AVX2-NEXT: shrl $3, %esi +; AVX2-NEXT: andl $56, %esi +; AVX2-NEXT: movq -112(%rsp,%rsi), %rdx +; AVX2-NEXT: movq -120(%rsp,%rsi), %rax +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: shrdq %cl, %rdx, %r8 +; AVX2-NEXT: movq -104(%rsp,%rsi), %r9 +; AVX2-NEXT: shrdq %cl, %r9, %rdx +; AVX2-NEXT: movq -96(%rsp,%rsi), %r10 +; AVX2-NEXT: shrdq %cl, %r10, %r9 +; AVX2-NEXT: movq -88(%rsp,%rsi), %r11 +; AVX2-NEXT: shrdq %cl, %r11, %r10 +; AVX2-NEXT: movq -80(%rsp,%rsi), %rbx +; AVX2-NEXT: shrdq %cl, %rbx, %r11 +; AVX2-NEXT: movq -128(%rsp,%rsi), %r14 +; AVX2-NEXT: movq -72(%rsp,%rsi), %rsi +; AVX2-NEXT: shrdq %cl, %rsi, %rbx +; AVX2-NEXT: shrdq %cl, %rax, %r14 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: sarxq %rcx, %rsi, %rcx +; AVX2-NEXT: movq %rcx, 56(%rdi) +; AVX2-NEXT: movq %rbx, 48(%rdi) +; AVX2-NEXT: movq %r11, 40(%rdi) +; AVX2-NEXT: movq %r10, 32(%rdi) +; AVX2-NEXT: movq %r9, 24(%rdi) +; AVX2-NEXT: movq %rdx, 
16(%rdi) +; AVX2-NEXT: movq %r8, 8(%rdi) +; AVX2-NEXT: movq %r14, (%rdi) +; AVX2-NEXT: addq $8, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; -; ZNVER4-LABEL: ashr_i512_1: -; ZNVER4: # %bb.0: -; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1 -; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; ZNVER4-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7] -; ZNVER4-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4 -; ZNVER4-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; ZNVER4-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1 -; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] -; ZNVER4-NEXT: vpsraq $1, %xmm2, %xmm2 -; ZNVER4-NEXT: vpshldq $63, %zmm0, %zmm3, %zmm0 -; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; ZNVER4-NEXT: retq - %d = bitcast <8 x i64> %a to i512 - %s = ashr i512 %d, 1 - %r = bitcast i512 %s to <8 x i64> - ret <8 x i64> %r +; AVX512F-LABEL: ashr_signbit_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: pushq %rax +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = -1 +; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808] +; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl %esi, %ecx +; AVX512F-NEXT: andl $63, %ecx +; AVX512F-NEXT: shrl $3, %esi +; AVX512F-NEXT: andl $56, %esi +; AVX512F-NEXT: movq -112(%rsp,%rsi), %rdx +; AVX512F-NEXT: movq -120(%rsp,%rsi), %rax +; AVX512F-NEXT: movq %rax, %r8 +; AVX512F-NEXT: shrdq %cl, %rdx, %r8 +; AVX512F-NEXT: movq -104(%rsp,%rsi), %r9 +; AVX512F-NEXT: shrdq %cl, %r9, %rdx +; AVX512F-NEXT: movq -96(%rsp,%rsi), %r10 +; AVX512F-NEXT: shrdq %cl, %r10, %r9 +; AVX512F-NEXT: movq -88(%rsp,%rsi), %r11 +; AVX512F-NEXT: shrdq %cl, %r11, %r10 +; AVX512F-NEXT: movq -80(%rsp,%rsi), %rbx +; AVX512F-NEXT: shrdq %cl, %rbx, %r11 +; AVX512F-NEXT: movq -128(%rsp,%rsi), %r14 +; AVX512F-NEXT: movq -72(%rsp,%rsi), %rsi +; AVX512F-NEXT: shrdq %cl, %rsi, %rbx +; AVX512F-NEXT: shrdq %cl, %rax, %r14 +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: sarxq %rcx, %rsi, %rcx +; AVX512F-NEXT: movq %rcx, 56(%rdi) +; AVX512F-NEXT: movq %rbx, 48(%rdi) +; AVX512F-NEXT: movq %r11, 40(%rdi) +; AVX512F-NEXT: movq %r10, 32(%rdi) +; AVX512F-NEXT: movq %r9, 24(%rdi) +; AVX512F-NEXT: movq %rdx, 16(%rdi) +; AVX512F-NEXT: movq %r8, 8(%rdi) +; AVX512F-NEXT: movq %r14, (%rdi) +; AVX512F-NEXT: addq $8, %rsp +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: ashr_signbit_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: pushq %r14 +; AVX512VL-NEXT: pushq %rbx +; AVX512VL-NEXT: pushq %rax +; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808] +; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VL-NEXT: movl %esi, %ecx +; AVX512VL-NEXT: andl $63, %ecx +; AVX512VL-NEXT: shrl $3, %esi +; AVX512VL-NEXT: andl $56, %esi +; AVX512VL-NEXT: movq -112(%rsp,%rsi), %rdx +; AVX512VL-NEXT: 
movq -120(%rsp,%rsi), %rax +; AVX512VL-NEXT: movq %rax, %r8 +; AVX512VL-NEXT: shrdq %cl, %rdx, %r8 +; AVX512VL-NEXT: movq -104(%rsp,%rsi), %r9 +; AVX512VL-NEXT: shrdq %cl, %r9, %rdx +; AVX512VL-NEXT: movq -96(%rsp,%rsi), %r10 +; AVX512VL-NEXT: shrdq %cl, %r10, %r9 +; AVX512VL-NEXT: movq -88(%rsp,%rsi), %r11 +; AVX512VL-NEXT: shrdq %cl, %r11, %r10 +; AVX512VL-NEXT: movq -80(%rsp,%rsi), %rbx +; AVX512VL-NEXT: shrdq %cl, %rbx, %r11 +; AVX512VL-NEXT: movq -72(%rsp,%rsi), %r14 +; AVX512VL-NEXT: shrdq %cl, %r14, %rbx +; AVX512VL-NEXT: movq -128(%rsp,%rsi), %rsi +; AVX512VL-NEXT: shrdq %cl, %rax, %rsi +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx +; AVX512VL-NEXT: movq %rcx, 56(%rdi) +; AVX512VL-NEXT: movq %rbx, 48(%rdi) +; AVX512VL-NEXT: movq %r11, 40(%rdi) +; AVX512VL-NEXT: movq %r10, 32(%rdi) +; AVX512VL-NEXT: movq %r9, 24(%rdi) +; AVX512VL-NEXT: movq %rdx, 16(%rdi) +; AVX512VL-NEXT: movq %r8, 8(%rdi) +; AVX512VL-NEXT: movq %rsi, (%rdi) +; AVX512VL-NEXT: addq $8, %rsp +; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: popq %r14 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512VBMI-LABEL: ashr_signbit_i512: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: pushq %r14 +; AVX512VBMI-NEXT: pushq %rbx +; AVX512VBMI-NEXT: pushq %rax +; AVX512VBMI-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808] +; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX512VBMI-NEXT: movl %esi, %ecx +; AVX512VBMI-NEXT: andl $63, %ecx +; AVX512VBMI-NEXT: shrl $3, %esi +; AVX512VBMI-NEXT: andl $56, %esi +; AVX512VBMI-NEXT: movq -112(%rsp,%rsi), %rdx +; AVX512VBMI-NEXT: movq -120(%rsp,%rsi), %rax +; AVX512VBMI-NEXT: movq %rax, %r8 +; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r8 +; AVX512VBMI-NEXT: movq -104(%rsp,%rsi), %r9 +; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx +; AVX512VBMI-NEXT: movq -96(%rsp,%rsi), %r10 +; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9 +; AVX512VBMI-NEXT: movq -88(%rsp,%rsi), %r11 +; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10 +; AVX512VBMI-NEXT: movq -80(%rsp,%rsi), %rbx +; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11 +; AVX512VBMI-NEXT: movq -72(%rsp,%rsi), %r14 +; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx +; AVX512VBMI-NEXT: movq -128(%rsp,%rsi), %rsi +; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi +; AVX512VBMI-NEXT: movq %rdi, %rax +; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx +; AVX512VBMI-NEXT: movq %rcx, 56(%rdi) +; AVX512VBMI-NEXT: movq %rbx, 48(%rdi) +; AVX512VBMI-NEXT: movq %r11, 40(%rdi) +; AVX512VBMI-NEXT: movq %r10, 32(%rdi) +; AVX512VBMI-NEXT: movq %r9, 24(%rdi) +; AVX512VBMI-NEXT: movq %rdx, 16(%rdi) +; AVX512VBMI-NEXT: movq %r8, 8(%rdi) +; AVX512VBMI-NEXT: movq %rsi, (%rdi) +; AVX512VBMI-NEXT: addq $8, %rsp +; AVX512VBMI-NEXT: popq %rbx +; AVX512VBMI-NEXT: popq %r14 +; AVX512VBMI-NEXT: vzeroupper +; AVX512VBMI-NEXT: retq + %s = shl i512 1, 511 + %r = ashr i512 %s, %a0 + ret i512 %r } diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll index ce56283..8cb0327 100644 --- a/llvm/test/CodeGen/X86/smul_fix.ll +++ b/llvm/test/CodeGen/X86/smul_fix.ll @@ -10,10 +10,10 @@ declare <4 x i32> @llvm.smul.fix.v4i32(<4 x i32>, <4 x i32>, i32) define i32 @func(i32 %x, i32 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: -; X64-NEXT: movslq %esi, %rax -; X64-NEXT: movslq %edi, %rcx -; X64-NEXT: imulq %rax, %rcx -; 
X64-NEXT: movq %rcx, %rax +; X64-NEXT: movslq %esi, %rcx +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrq $32, %rax ; X64-NEXT: shldl $30, %ecx, %eax ; X64-NEXT: # kill: def $eax killed $eax killed $rax diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll index 18588aa..fade0f7 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -fp-contract=fast < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s declare float @llvm.sqrt.f32(float) #2 @@ -24,17 +24,17 @@ define float @sqrt_ieee_ninf(float %f) #0 { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf afn VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf contract afn VRSQRTSSr killed [[DEF]], [[COPY]] + ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr - ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: 
[[VMULSSrr5:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr128 = COPY killed [[VMULSSrr5]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr128 = COPY [[COPY]] ; CHECK-NEXT: [[VPBROADCASTDrm:%[0-9]+]]:vr128 = VPBROADCASTDrm $rip, 1, $noreg, %const.2, $noreg :: (load (s32) from constant-pool) @@ -46,7 +46,7 @@ define float @sqrt_ieee_ninf(float %f) #0 { ; CHECK-NEXT: [[COPY5:%[0-9]+]]:fr32 = COPY killed [[VPANDNrr]] ; CHECK-NEXT: $xmm0 = COPY [[COPY5]] ; CHECK-NEXT: RET 0, $xmm0 - %call = tail call ninf afn float @llvm.sqrt.f32(float %f) + %call = tail call ninf afn contract float @llvm.sqrt.f32(float %f) ret float %call } @@ -71,17 +71,17 @@ define float @sqrt_daz_ninf(float %f) #1 { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf afn VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf contract afn VRSQRTSSr killed [[DEF]], [[COPY]] + ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr - ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr ; CHECK-NEXT: 
[[COPY1:%[0-9]+]]:vr128 = COPY killed [[VMULSSrr5]] ; CHECK-NEXT: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS ; CHECK-NEXT: [[VCMPSSrri:%[0-9]+]]:fr32 = nofpexcept VCMPSSrri [[COPY]], killed [[FsFLD0SS]], 0, implicit $mxcsr @@ -90,7 +90,7 @@ define float @sqrt_daz_ninf(float %f) #1 { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fr32 = COPY killed [[VPANDNrr]] ; CHECK-NEXT: $xmm0 = COPY [[COPY3]] ; CHECK-NEXT: RET 0, $xmm0 - %call = tail call ninf afn float @llvm.sqrt.f32(float %f) + %call = tail call ninf afn contract float @llvm.sqrt.f32(float %f) ret float %call } diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index bb7245c..82e840b 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -2213,12 +2213,12 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE2-NEXT: movq %rdi, %rax ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm5 ; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [9,0,41,183,1,1,161,221] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [9,u,0,u,41,u,183,u,1,u,1,u,161,u,221,u] ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; CHECK-SSE2-NEXT: pand %xmm4, %xmm5 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm6 ; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [171,103,183,171,61,1,127,183] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [171,u,103,u,183,u,171,u,61,u,1,u,127,u,183,u] ; CHECK-SSE2-NEXT: pand %xmm4, %xmm6 ; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6 ; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 @@ -2242,10 +2242,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE2-NEXT: por %xmm7, %xmm5 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [223,223,205,183,161,1,171,239] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [223,u,223,u,205,u,183,u,161,u,1,u,171,u,239,u] ; CHECK-SSE2-NEXT: pand %xmm4, %xmm1 ; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,205,27,241,1,1,1,163] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,u,205,u,27,u,241,u,1,u,1,u,1,u,163,u] ; CHECK-SSE2-NEXT: pand %xmm4, %xmm0 ; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 ; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2275,8 +2275,8 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4 ; CHECK-SSE41-NEXT: movq %rdi, %rax -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [171,103,183,171,61,1,127,183,9,0,41,183,1,1,161,221] +; CHECK-SSE41-NEXT: pmullw %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; CHECK-SSE41-NEXT: pand %xmm5, %xmm0 ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm6 @@ -2302,8 +2302,8 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm1 ; CHECK-SSE41-NEXT: movaps 
{{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] ; CHECK-SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm0 -; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,0,27,0,1,0,1,0,223,0,205,0,161,0,171,0] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [197,205,27,241,1,1,1,163,223,223,205,183,161,1,171,239] +; CHECK-SSE41-NEXT: pmullw %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pand %xmm5, %xmm0 ; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239] ; CHECK-SSE41-NEXT: psllw $8, %xmm4 @@ -2341,7 +2341,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47] ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 ; CHECK-AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm4 -; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm5 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0] +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm5 # [0,0,0,0,1,1,1,0,1,1,0,1,0,1,0,1] ; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5 ; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6 # [0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1] @@ -2361,7 +2361,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; CHECK-AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 ; CHECK-AVX1-NEXT: vpaddb %xmm4, %xmm6, %xmm4 -; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm6 # [13,0,19,0,2,0,2,0,62,0,5,0,97,0,3,0] +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm6 # [13,5,19,34,2,8,2,88,62,62,5,7,97,2,3,60] ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6 ; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,5,0,34,0,8,0,88,0,62,0,7,0,2,0,60] ; CHECK-AVX1-NEXT: vpsllw $8, %xmm4, %xmm4 @@ -2375,7 +2375,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 # [0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147] ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 -; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm7 # [0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0] +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm7 # [0,0,1,0,0,255,0,1,0,1,0,1,1,1,0,1] ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8 # [0,0,0,0,0,255,0,1,0,1,0,1,0,1,0,1] ; CHECK-AVX1-NEXT: vpsllw $8, %xmm8, %xmm8 @@ -2394,7 +2394,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 ; CHECK-AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5 ; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm7, %xmm5 -; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm6 # [3,0,7,0,84,0,127,0,114,0,50,0,2,0,97,0] +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm6 # [3,87,7,6,84,128,127,56,114,1,50,7,2,8,97,117] ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3 ; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,87,0,6,0,128,0,56,0,1,0,7,0,8,0,117] ; CHECK-AVX1-NEXT: vpsllw $8, %xmm5, %xmm5 @@ -2423,7 +2423,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX2-NEXT: 
vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47,0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147] ; CHECK-AVX2-NEXT: vpsrlw $8, %ymm4, %ymm4 ; CHECK-AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 -; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0] +; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 # [0,0,0,0,1,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,0,255,0,1,0,1,0,1,1,1,0,1] ; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm6 # [0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,255,0,1,0,1,0,1,0,1,0,1] @@ -2443,7 +2443,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; CHECK-AVX2-NEXT: vpaddb %ymm3, %ymm4, %ymm3 -; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 # [13,0,19,0,2,0,2,0,62,0,5,0,97,0,3,0,3,0,7,0,84,0,127,0,114,0,50,0,2,0,97,0] +; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 # [13,5,19,34,2,8,2,88,62,62,5,7,97,2,3,60,3,87,7,6,84,128,127,56,114,1,50,7,2,8,97,117] ; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,5,0,34,0,8,0,88,0,62,0,7,0,2,0,60,0,87,0,6,0,128,0,56,0,1,0,7,0,8,0,117] ; CHECK-AVX2-NEXT: vpsllw $8, %ymm3, %ymm3 @@ -2458,7 +2458,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; ; CHECK-AVX512VL-LABEL: pr51133: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [197,0,27,0,1,0,1,0,223,0,205,0,161,0,171,0,171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0] +; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [197,205,27,241,1,1,1,163,223,223,205,183,161,1,171,239,171,103,183,171,61,1,127,183,9,0,41,183,1,1,161,221] ; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239,0,103,0,171,0,1,0,183,0,0,0,183,0,1,0,221] ; CHECK-AVX512VL-NEXT: vpsllw $8, %ymm3, %ymm3 ; CHECK-AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & m32bcst) diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll index e936e1e..0fb6eb3 100644 --- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | 
FileCheck %s --check-prefixes=AVX,AVX512 define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { ; SSE-LABEL: fold_srem_vec_1: @@ -55,55 +57,105 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: fold_srem_vec_1: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: subl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $9, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 -; AVX-NEXT: shrl $16, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: movswl %dx, %esi -; AVX-NEXT: shrl $15, %edx -; AVX-NEXT: sarl $6, %esi -; AVX-NEXT: addl %edx, %esi -; AVX-NEXT: imull $95, %esi, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpextrw $1, %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $21, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: imull $-124, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: imull $2675, %edx, %edx # imm = 0xA73 -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $18, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: imull $98, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 -; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: fold_srem_vec_1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpextrw $3, %xmm0, %eax +; AVX1OR2-NEXT: movswl %ax, %ecx +; AVX1OR2-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 +; AVX1OR2-NEXT: shrl $16, %ecx +; AVX1OR2-NEXT: subl %eax, %ecx +; AVX1OR2-NEXT: movzwl %cx, %ecx +; AVX1OR2-NEXT: movswl %cx, %edx +; AVX1OR2-NEXT: shrl $15, %ecx +; AVX1OR2-NEXT: sarl $9, %edx +; AVX1OR2-NEXT: addl %ecx, %edx +; AVX1OR2-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 +; AVX1OR2-NEXT: subl %ecx, %eax +; AVX1OR2-NEXT: vmovd %xmm0, %ecx +; AVX1OR2-NEXT: movswl %cx, %edx +; AVX1OR2-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 +; AVX1OR2-NEXT: shrl $16, %edx +; AVX1OR2-NEXT: addl %ecx, %edx +; AVX1OR2-NEXT: movzwl %dx, %edx +; AVX1OR2-NEXT: movswl %dx, %esi +; AVX1OR2-NEXT: shrl $15, %edx +; AVX1OR2-NEXT: sarl $6, %esi +; AVX1OR2-NEXT: addl %edx, %esi +; AVX1OR2-NEXT: imull $95, %esi, %edx +; AVX1OR2-NEXT: subl %edx, %ecx +; AVX1OR2-NEXT: vmovd %ecx, %xmm1 +; AVX1OR2-NEXT: vpextrw $1, %xmm0, %ecx +; AVX1OR2-NEXT: movswl %cx, %edx +; AVX1OR2-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF +; AVX1OR2-NEXT: movl %edx, %esi +; AVX1OR2-NEXT: shrl $31, %esi +; AVX1OR2-NEXT: sarl $21, %edx +; AVX1OR2-NEXT: addl %esi, %edx +; AVX1OR2-NEXT: imull $-124, %edx, %edx +; AVX1OR2-NEXT: subl %edx, %ecx +; AVX1OR2-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpextrw $2, %xmm0, %ecx +; AVX1OR2-NEXT: movswl %cx, %edx +; AVX1OR2-NEXT: imull $2675, %edx, %edx # imm = 0xA73 +; AVX1OR2-NEXT: movl %edx, %esi +; AVX1OR2-NEXT: shrl $31, %esi +; AVX1OR2-NEXT: sarl $18, %edx +; AVX1OR2-NEXT: addl %esi, %edx +; AVX1OR2-NEXT: imull $98, %edx, %edx +; AVX1OR2-NEXT: subl %edx, %ecx +; 
AVX1OR2-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 +; AVX1OR2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: fold_srem_vec_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrw $3, %xmm0, %eax +; AVX512-NEXT: movswl %ax, %ecx +; AVX512-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 +; AVX512-NEXT: shrl $16, %ecx +; AVX512-NEXT: subl %eax, %ecx +; AVX512-NEXT: movzwl %cx, %edx +; AVX512-NEXT: movswl %dx, %ecx +; AVX512-NEXT: shrl $15, %edx +; AVX512-NEXT: sarl $9, %ecx +; AVX512-NEXT: addl %edx, %ecx +; AVX512-NEXT: vmovd %xmm0, %edx +; AVX512-NEXT: movswl %dx, %esi +; AVX512-NEXT: imull $-21385, %esi, %esi # imm = 0xAC77 +; AVX512-NEXT: shrl $16, %esi +; AVX512-NEXT: addl %edx, %esi +; AVX512-NEXT: movzwl %si, %esi +; AVX512-NEXT: movswl %si, %edi +; AVX512-NEXT: shrl $15, %esi +; AVX512-NEXT: sarl $6, %edi +; AVX512-NEXT: addl %esi, %edi +; AVX512-NEXT: imull $95, %edi, %esi +; AVX512-NEXT: subl %esi, %edx +; AVX512-NEXT: vmovd %edx, %xmm1 +; AVX512-NEXT: vpextrw $1, %xmm0, %edx +; AVX512-NEXT: movswl %dx, %esi +; AVX512-NEXT: imull $-16913, %esi, %esi # imm = 0xBDEF +; AVX512-NEXT: movl %esi, %edi +; AVX512-NEXT: shrl $31, %edi +; AVX512-NEXT: sarl $21, %esi +; AVX512-NEXT: addl %edi, %esi +; AVX512-NEXT: imull $-1003, %ecx, %ecx # imm = 0xFC15 +; AVX512-NEXT: imull $-124, %esi, %esi +; AVX512-NEXT: subl %esi, %edx +; AVX512-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1 +; AVX512-NEXT: vpextrw $2, %xmm0, %edx +; AVX512-NEXT: subl %ecx, %eax +; AVX512-NEXT: movswl %dx, %ecx +; AVX512-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 +; AVX512-NEXT: movl %ecx, %esi +; AVX512-NEXT: shrl $31, %esi +; AVX512-NEXT: sarl $18, %ecx +; AVX512-NEXT: addl %esi, %ecx +; AVX512-NEXT: imull $98, %ecx, %ecx +; AVX512-NEXT: subl %ecx, %edx +; AVX512-NEXT: vpinsrw $2, %edx, %xmm1, %xmm0 +; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = srem <4 x i16> %x, <i16 95, i16 -124, i16 98, i16 -1003> ret <4 x i16> %1 } @@ -139,20 +191,35 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { ; Don't fold if we can combine srem with sdiv. 
define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { -; SSE-LABEL: combine_srem_sdiv: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhw %xmm0, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrlw $15, %xmm2 -; SSE-NEXT: psraw $6, %xmm1 -; SSE-NEXT: paddw %xmm2, %xmm1 -; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] -; SSE-NEXT: pmullw %xmm1, %xmm2 -; SSE-NEXT: psubw %xmm2, %xmm0 -; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: combine_srem_sdiv: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE2-NEXT: pmulhw %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrlw $15, %xmm2 +; SSE2-NEXT: psraw $6, %xmm1 +; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE2-NEXT: pmullw %xmm1, %xmm2 +; SSE2-NEXT: psubw %xmm2, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: combine_srem_sdiv: +; SSE4: # %bb.0: +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE4-NEXT: pmulhw %xmm0, %xmm1 +; SSE4-NEXT: paddw %xmm0, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm2 +; SSE4-NEXT: psrlw $15, %xmm2 +; SSE4-NEXT: psraw $6, %xmm1 +; SSE4-NEXT: paddw %xmm2, %xmm1 +; SSE4-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE4-NEXT: pmullw %xmm1, %xmm2 +; SSE4-NEXT: psubw %xmm2, %xmm0 +; SSE4-NEXT: paddw %xmm1, %xmm0 +; SSE4-NEXT: retq ; ; AVX-LABEL: combine_srem_sdiv: ; AVX: # %bb.0: @@ -421,48 +488,93 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { ; Don't fold i64 srem. define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { -; SSE-LABEL: dont_fold_srem_i64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movq %xmm1, %rcx -; SSE-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: imulq %rdx -; SSE-NEXT: addq %rcx, %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: shrq $63, %rax -; SSE-NEXT: sarq $4, %rdx -; SSE-NEXT: addq %rax, %rdx -; SSE-NEXT: leaq (%rdx,%rdx,2), %rax -; SSE-NEXT: shlq $3, %rax -; SSE-NEXT: subq %rax, %rdx -; SSE-NEXT: addq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: pextrq $1, %xmm2, %rcx -; SSE-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: imulq %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: shrq $63, %rax -; SSE-NEXT: sarq $11, %rdx -; SSE-NEXT: addq %rax, %rdx -; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F -; SSE-NEXT: subq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm2 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: pextrq $1, %xmm0, %rcx -; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: imulq %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: shrq $63, %rax -; SSE-NEXT: sarq $8, %rdx -; SSE-NEXT: addq %rax, %rdx -; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; SSE-NEXT: subq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE-NEXT: retq +; SSE2-LABEL: dont_fold_srem_i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: movq %xmm1, %rcx +; SSE2-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: imulq %rdx +; SSE2-NEXT: addq 
%rcx, %rdx +; SSE2-NEXT: movq %rdx, %rax +; SSE2-NEXT: shrq $63, %rax +; SSE2-NEXT: sarq $4, %rdx +; SSE2-NEXT: addq %rax, %rdx +; SSE2-NEXT: leaq (%rdx,%rdx,2), %rax +; SSE2-NEXT: shlq $3, %rax +; SSE2-NEXT: subq %rax, %rdx +; SSE2-NEXT: addq %rcx, %rdx +; SSE2-NEXT: movq %rdx, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE2-NEXT: movq %xmm2, %rcx +; SSE2-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: imulq %rdx +; SSE2-NEXT: movq %rdx, %rax +; SSE2-NEXT: shrq $63, %rax +; SSE2-NEXT: sarq $11, %rdx +; SSE2-NEXT: addq %rax, %rdx +; SSE2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; SSE2-NEXT: subq %rax, %rcx +; SSE2-NEXT: movq %rcx, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rcx +; SSE2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: imulq %rdx +; SSE2-NEXT: movq %rdx, %rax +; SSE2-NEXT: shrq $63, %rax +; SSE2-NEXT: sarq $8, %rdx +; SSE2-NEXT: addq %rax, %rdx +; SSE2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; SSE2-NEXT: subq %rax, %rcx +; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE2-NEXT: retq +; +; SSE4-LABEL: dont_fold_srem_i64: +; SSE4: # %bb.0: +; SSE4-NEXT: movdqa %xmm1, %xmm2 +; SSE4-NEXT: movq %xmm1, %rcx +; SSE4-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 +; SSE4-NEXT: movq %rcx, %rax +; SSE4-NEXT: imulq %rdx +; SSE4-NEXT: addq %rcx, %rdx +; SSE4-NEXT: movq %rdx, %rax +; SSE4-NEXT: shrq $63, %rax +; SSE4-NEXT: sarq $4, %rdx +; SSE4-NEXT: addq %rax, %rdx +; SSE4-NEXT: leaq (%rdx,%rdx,2), %rax +; SSE4-NEXT: shlq $3, %rax +; SSE4-NEXT: subq %rax, %rdx +; SSE4-NEXT: addq %rcx, %rdx +; SSE4-NEXT: movq %rdx, %xmm1 +; SSE4-NEXT: pextrq $1, %xmm2, %rcx +; SSE4-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 +; SSE4-NEXT: movq %rcx, %rax +; SSE4-NEXT: imulq %rdx +; SSE4-NEXT: movq %rdx, %rax +; SSE4-NEXT: shrq $63, %rax +; SSE4-NEXT: sarq $11, %rdx +; SSE4-NEXT: addq %rax, %rdx +; SSE4-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; SSE4-NEXT: subq %rax, %rcx +; SSE4-NEXT: movq %rcx, %xmm2 +; SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE4-NEXT: pextrq $1, %xmm0, %rcx +; SSE4-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; SSE4-NEXT: movq %rcx, %rax +; SSE4-NEXT: imulq %rdx +; SSE4-NEXT: movq %rdx, %rax +; SSE4-NEXT: shrq $63, %rax +; SSE4-NEXT: sarq $8, %rdx +; SSE4-NEXT: addq %rax, %rdx +; SSE4-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; SSE4-NEXT: subq %rax, %rcx +; SSE4-NEXT: movq %rcx, %xmm0 +; SSE4-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE4-NEXT: retq ; ; AVX1-LABEL: dont_fold_srem_i64: ; AVX1: # %bb.0: @@ -551,6 +663,50 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512-LABEL: dont_fold_srem_i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %rcx +; AVX512-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 +; AVX512-NEXT: movq %rcx, %rax +; AVX512-NEXT: imulq %rdx +; AVX512-NEXT: addq %rcx, %rdx +; AVX512-NEXT: movq %rdx, %rax +; AVX512-NEXT: shrq $63, %rax +; 
AVX512-NEXT: sarq $4, %rdx +; AVX512-NEXT: addq %rax, %rdx +; AVX512-NEXT: leaq (%rdx,%rdx,2), %rax +; AVX512-NEXT: shlq $3, %rax +; AVX512-NEXT: subq %rax, %rdx +; AVX512-NEXT: addq %rcx, %rdx +; AVX512-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512-NEXT: vmovq %rdx, %xmm1 +; AVX512-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 +; AVX512-NEXT: movq %rcx, %rax +; AVX512-NEXT: imulq %rdx +; AVX512-NEXT: movq %rdx, %rax +; AVX512-NEXT: shrq $63, %rax +; AVX512-NEXT: sarq $11, %rdx +; AVX512-NEXT: addq %rax, %rdx +; AVX512-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; AVX512-NEXT: subq %rax, %rcx +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm2[0] +; AVX512-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; AVX512-NEXT: movq %rcx, %rax +; AVX512-NEXT: imulq %rdx +; AVX512-NEXT: movq %rdx, %rax +; AVX512-NEXT: shrq $63, %rax +; AVX512-NEXT: sarq $8, %rdx +; AVX512-NEXT: addq %rax, %rdx +; AVX512-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; AVX512-NEXT: subq %rax, %rcx +; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: retq %1 = srem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423> ret <4 x i64> %1 } diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll index e5ea911..a93be22 100644 --- a/llvm/test/CodeGen/X86/sshl_sat.ll +++ b/llvm/test/CodeGen/X86/sshl_sat.ll @@ -15,16 +15,16 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movl %edi, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl %dx, %esi +; X64-NEXT: shll %cl, %edi +; X64-NEXT: movswl %di, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %esi ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: testw %di, %di +; X64-NEXT: testw %dx, %dx ; X64-NEXT: sets %al ; X64-NEXT: addl $32767, %eax # imm = 0x7FFF -; X64-NEXT: cmpw %si, %di -; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: cmpw %si, %dx +; X64-NEXT: cmovel %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; @@ -33,17 +33,17 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movswl %dx, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %al ; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx -; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -58,18 +58,18 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: addl %eax, %eax -; X64-NEXT: xorl %edx, %edx +; X64-NEXT: movl %eax, %edx +; X64-NEXT: xorl %esi, %esi ; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %dl -; X64-NEXT: addl $32767, %edx # imm = 0x7FFF -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movswl %si, %edi +; X64-NEXT: sets %sil +; X64-NEXT: addl $32767, %esi # imm = 0x7FFF +; X64-NEXT: shll %cl, %eax +; X64-NEXT: movswl %ax, 
%edi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %edi -; X64-NEXT: cmpw %di, %ax -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movswl %si, %eax +; X64-NEXT: cmpw %di, %dx +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: cwtl ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index 10dee14..ff76707 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -365,119 +365,118 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movswl %bx, %ebp +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movswl %di, %ebp ; X86-NEXT: sarl %cl, %ebp ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %di, %di +; X86-NEXT: testw %bx, %bx ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %bp, %di -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmovel %ebx, %ecx +; X86-NEXT: cmpw %bp, %bx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: cmovel %edi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movswl %di, %ebx -; X86-NEXT: sarl %cl, %ebx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %si, %si -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %bx, %si ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi ; X86-NEXT: shll %cl, %esi ; X86-NEXT: movswl %si, %edi ; X86-NEXT: sarl %cl, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %bx, %bx +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: cmpw %di, %bx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx +; X86-NEXT: cmovel %esi, %ebp ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movswl %dx, %esi ; X86-NEXT: sarl %cl, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %di, %di ; X86-NEXT: sets %bl ; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %di +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmovel %edx, %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edx +; X86-NEXT: sarl %cl, %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %si, %si +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: cmpw %dx, %si +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; 
X86-NEXT: cmovel %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll %cl, %edx -; X86-NEXT: movswl %dx, %esi -; X86-NEXT: sarl %cl, %esi +; X86-NEXT: movswl %dx, %eax +; X86-NEXT: sarl %cl, %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmpw %ax, %si ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmovel %edx, %ecx -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movswl %dx, %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %esi ; X86-NEXT: sarl %cl, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %dx, %dx ; X86-NEXT: sets %bl ; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: cmovel %edx, %ebx +; X86-NEXT: cmpw %si, %dx +; X86-NEXT: cmovel %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %dl ; X86-NEXT: addl $32767, %edx # imm = 0x7FFF -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovel %esi, %edx +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovel %esi, %ecx +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movw %cx, 14(%eax) ; X86-NEXT: movw %dx, 12(%eax) ; X86-NEXT: movw %bx, 10(%eax) -; X86-NEXT: movw %bp, 8(%eax) ; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, 6(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movw %cx, 2(%eax) +; X86-NEXT: movw %bp, 2(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $16, %esp diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll index cd576b1..345fa0e 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll @@ -4,16 +4,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" -declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> 
@llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>) declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>) declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>) declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>) declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -23,11 +23,11 @@ define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -37,11 +37,11 @@ define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -51,11 +51,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32 ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x 
i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -65,11 +65,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1) ret <8 x i32> %2 } -define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -79,11 +79,11 @@ define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> % ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -93,11 +93,11 @@ define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -107,11 +107,11 @@ define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3 ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x 
i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -121,7 +121,7 @@ define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1 ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1) ret <8 x i32> %2 } diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll index 534352f..47537c8 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O3 -disable-peephole -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) -declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) @@ -14,7 +14,7 @@ declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -26,11 +26,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd2,0x44,0x24,0xe8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <4 x i32> 
@llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -42,11 +42,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd2,0x44,0x24,0xd8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -58,11 +58,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd3,0x44,0x24,0xe8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -74,7 +74,7 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd3,0x44,0x24,0xd8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } diff --git a/llvm/test/CodeGen/X86/stackmap.ll b/llvm/test/CodeGen/X86/stackmap.ll index 72406aa..9bf88cb 100644 --- a/llvm/test/CodeGen/X86/stackmap.ll +++ b/llvm/test/CodeGen/X86/stackmap.ll @@ -1,7 
+1,10 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -terminal-rule=0 | FileCheck %s
 ;
 ; Note: Print verbose stackmaps using -debug-only=stackmaps.
+; FIXME: Test should be fixed to produce the correctly sized spill with the
+; -terminal-rule=0 flag removed
+
 ; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
 ; CHECK-NEXT: __LLVM_StackMaps:
 ; Header
@@ -546,8 +549,8 @@ define void @clobberScratch(i32 %a) {
   ret void
 }
-; A stack frame which needs to be realigned at runtime (to meet alignment
-; criteria for values on the stack) does not have a fixed frame size.
+; A stack frame which needs to be realigned at runtime (to meet alignment
+; criteria for values on the stack) does not have a fixed frame size.
 ; CHECK-LABEL: .long L{{.*}}-_needsStackRealignment
 ; CHECK-NEXT: .short 0
 ; 0 locations
diff --git a/llvm/test/CodeGen/X86/strictfp-inlineasm.ll b/llvm/test/CodeGen/X86/strictfp-inlineasm.ll
new file mode 100644
index 0000000..674c12a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/strictfp-inlineasm.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefixes=X64
+
+define i32 @foo() strictfp {
+; X86-LABEL: foo:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: #APP
+; X86-NEXT: #NO_APP
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: foo:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: #APP
+; X64-NEXT: #NO_APP
+; X64-NEXT: movl $-1, %eax
+; X64-NEXT: retq
+entry:
+  tail call void asm sideeffect "", "r"(i32 1) #1, !srcloc !0
+  ret i32 -1
+}
+
+
+!0 = !{i64 87}
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index 5bd624c..01fbafb 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -2429,126 +2429,126 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
 ; SSE2-ONLY: # %bb.0:
 ; SSE2-ONLY-NEXT: movl (%rdi), %eax
 ; SSE2-ONLY-NEXT: notl %eax
-; SSE2-ONLY-NEXT: movw %ax, (%rsi)
 ; SSE2-ONLY-NEXT: movl %eax, %ecx
-; SSE2-ONLY-NEXT: shrl $16, %ecx
-; SSE2-ONLY-NEXT: movb %cl, 2(%rsi)
-; SSE2-ONLY-NEXT: movb %cl, 2(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, (%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 6(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 4(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 10(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 8(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 14(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 12(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 18(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 16(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 22(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 20(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 26(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 24(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 30(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 28(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 34(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 32(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 38(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 36(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 42(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 40(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 46(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 44(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 50(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 48(%rdx)
-; SSE2-ONLY-NEXT: movb
%cl, 54(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 52(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 58(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 56(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 62(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 60(%rdx) +; SSE2-ONLY-NEXT: movw %ax, (%rsi) +; SSE2-ONLY-NEXT: shrl $16, %eax +; SSE2-ONLY-NEXT: movb %al, 2(%rsi) +; SSE2-ONLY-NEXT: movb %al, 2(%rdx) +; SSE2-ONLY-NEXT: movw %cx, (%rdx) +; SSE2-ONLY-NEXT: movb %al, 6(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 4(%rdx) +; SSE2-ONLY-NEXT: movb %al, 10(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 8(%rdx) +; SSE2-ONLY-NEXT: movb %al, 14(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 12(%rdx) +; SSE2-ONLY-NEXT: movb %al, 18(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 16(%rdx) +; SSE2-ONLY-NEXT: movb %al, 22(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 20(%rdx) +; SSE2-ONLY-NEXT: movb %al, 26(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 24(%rdx) +; SSE2-ONLY-NEXT: movb %al, 30(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 28(%rdx) +; SSE2-ONLY-NEXT: movb %al, 34(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 32(%rdx) +; SSE2-ONLY-NEXT: movb %al, 38(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 36(%rdx) +; SSE2-ONLY-NEXT: movb %al, 42(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 40(%rdx) +; SSE2-ONLY-NEXT: movb %al, 46(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 44(%rdx) +; SSE2-ONLY-NEXT: movb %al, 50(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 48(%rdx) +; SSE2-ONLY-NEXT: movb %al, 54(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 52(%rdx) +; SSE2-ONLY-NEXT: movb %al, 58(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 56(%rdx) +; SSE2-ONLY-NEXT: movb %al, 62(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 60(%rdx) ; SSE2-ONLY-NEXT: retq ; ; SSE3-LABEL: vec384_v3i8: ; SSE3: # %bb.0: ; SSE3-NEXT: movl (%rdi), %eax ; SSE3-NEXT: notl %eax -; SSE3-NEXT: movw %ax, (%rsi) ; SSE3-NEXT: movl %eax, %ecx -; SSE3-NEXT: shrl $16, %ecx -; SSE3-NEXT: movb %cl, 2(%rsi) -; SSE3-NEXT: movb %cl, 2(%rdx) -; SSE3-NEXT: movw %ax, (%rdx) -; SSE3-NEXT: movb %cl, 6(%rdx) -; SSE3-NEXT: movw %ax, 4(%rdx) -; SSE3-NEXT: movb %cl, 10(%rdx) -; SSE3-NEXT: movw %ax, 8(%rdx) -; SSE3-NEXT: movb %cl, 14(%rdx) -; SSE3-NEXT: movw %ax, 12(%rdx) -; SSE3-NEXT: movb %cl, 18(%rdx) -; SSE3-NEXT: movw %ax, 16(%rdx) -; SSE3-NEXT: movb %cl, 22(%rdx) -; SSE3-NEXT: movw %ax, 20(%rdx) -; SSE3-NEXT: movb %cl, 26(%rdx) -; SSE3-NEXT: movw %ax, 24(%rdx) -; SSE3-NEXT: movb %cl, 30(%rdx) -; SSE3-NEXT: movw %ax, 28(%rdx) -; SSE3-NEXT: movb %cl, 34(%rdx) -; SSE3-NEXT: movw %ax, 32(%rdx) -; SSE3-NEXT: movb %cl, 38(%rdx) -; SSE3-NEXT: movw %ax, 36(%rdx) -; SSE3-NEXT: movb %cl, 42(%rdx) -; SSE3-NEXT: movw %ax, 40(%rdx) -; SSE3-NEXT: movb %cl, 46(%rdx) -; SSE3-NEXT: movw %ax, 44(%rdx) -; SSE3-NEXT: movb %cl, 50(%rdx) -; SSE3-NEXT: movw %ax, 48(%rdx) -; SSE3-NEXT: movb %cl, 54(%rdx) -; SSE3-NEXT: movw %ax, 52(%rdx) -; SSE3-NEXT: movb %cl, 58(%rdx) -; SSE3-NEXT: movw %ax, 56(%rdx) -; SSE3-NEXT: movb %cl, 62(%rdx) -; SSE3-NEXT: movw %ax, 60(%rdx) +; SSE3-NEXT: movw %ax, (%rsi) +; SSE3-NEXT: shrl $16, %eax +; SSE3-NEXT: movb %al, 2(%rsi) +; SSE3-NEXT: movb %al, 2(%rdx) +; SSE3-NEXT: movw %cx, (%rdx) +; SSE3-NEXT: movb %al, 6(%rdx) +; SSE3-NEXT: movw %cx, 4(%rdx) +; SSE3-NEXT: movb %al, 10(%rdx) +; SSE3-NEXT: movw %cx, 8(%rdx) +; SSE3-NEXT: movb %al, 14(%rdx) +; SSE3-NEXT: movw %cx, 12(%rdx) +; SSE3-NEXT: movb %al, 18(%rdx) +; SSE3-NEXT: movw %cx, 16(%rdx) +; SSE3-NEXT: movb %al, 22(%rdx) +; SSE3-NEXT: movw %cx, 20(%rdx) +; SSE3-NEXT: movb %al, 26(%rdx) +; SSE3-NEXT: movw %cx, 24(%rdx) +; SSE3-NEXT: movb %al, 30(%rdx) +; SSE3-NEXT: movw %cx, 28(%rdx) +; SSE3-NEXT: movb %al, 34(%rdx) +; SSE3-NEXT: movw %cx, 32(%rdx) +; SSE3-NEXT: movb %al, 38(%rdx) +; SSE3-NEXT: movw %cx, 
36(%rdx) +; SSE3-NEXT: movb %al, 42(%rdx) +; SSE3-NEXT: movw %cx, 40(%rdx) +; SSE3-NEXT: movb %al, 46(%rdx) +; SSE3-NEXT: movw %cx, 44(%rdx) +; SSE3-NEXT: movb %al, 50(%rdx) +; SSE3-NEXT: movw %cx, 48(%rdx) +; SSE3-NEXT: movb %al, 54(%rdx) +; SSE3-NEXT: movw %cx, 52(%rdx) +; SSE3-NEXT: movb %al, 58(%rdx) +; SSE3-NEXT: movw %cx, 56(%rdx) +; SSE3-NEXT: movb %al, 62(%rdx) +; SSE3-NEXT: movw %cx, 60(%rdx) ; SSE3-NEXT: retq ; ; SSSE3-ONLY-LABEL: vec384_v3i8: ; SSSE3-ONLY: # %bb.0: ; SSSE3-ONLY-NEXT: movl (%rdi), %eax ; SSSE3-ONLY-NEXT: notl %eax -; SSSE3-ONLY-NEXT: movw %ax, (%rsi) ; SSSE3-ONLY-NEXT: movl %eax, %ecx -; SSSE3-ONLY-NEXT: shrl $16, %ecx -; SSSE3-ONLY-NEXT: movb %cl, 2(%rsi) -; SSSE3-ONLY-NEXT: movb %cl, 2(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, (%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 6(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 10(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 8(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 14(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 18(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 16(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 22(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 26(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 24(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 30(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 34(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 32(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 38(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 42(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 40(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 46(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 50(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 48(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 54(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 58(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 56(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 62(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, (%rsi) +; SSSE3-ONLY-NEXT: shrl $16, %eax +; SSSE3-ONLY-NEXT: movb %al, 2(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 2(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, (%rdx) +; SSSE3-ONLY-NEXT: movb %al, 6(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 4(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 10(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 8(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 14(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 12(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 18(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 16(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 22(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 20(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 26(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 24(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 30(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 28(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 34(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 32(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 38(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 36(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 42(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 40(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 46(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 44(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 50(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 48(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 54(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 52(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 58(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 56(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 62(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 60(%rdx) ; SSSE3-ONLY-NEXT: retq ; ; SSE41-LABEL: vec384_v3i8: diff --git a/llvm/test/CodeGen/X86/twoaddr-lea.ll b/llvm/test/CodeGen/X86/twoaddr-lea.ll index f20b777..3ad3e9a 100644 --- a/llvm/test/CodeGen/X86/twoaddr-lea.ll +++ b/llvm/test/CodeGen/X86/twoaddr-lea.ll @@ -65,10 
+65,10 @@ entry:
 define void @ham() {
 ; CHECK-LABEL: ham:
 ; CHECK: ## %bb.0: ## %bb
+; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: xorl %ecx, %ecx
 ; CHECK-NEXT: movq _global@GOTPCREL(%rip), %rdx
 ; CHECK-NEXT: movq _global2@GOTPCREL(%rip), %rsi
-; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: testb %cl, %cl
 ; CHECK-NEXT: je LBB3_2
 ; CHECK-NEXT: .p2align 4
diff --git a/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll
new file mode 100644
index 0000000..6739be5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll
@@ -0,0 +1,166 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64 | FileCheck %s
+
+; GitHub issue #161036
+
+; Positive test: umin(sub(a,b),a) with scalar types should be folded
+define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: cmpq %rdi, %rax
+; CHECK-NEXT: cmovaeq %rdi, %rax
+; CHECK-NEXT: retq
+  %sub = sub i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
+
+; Positive test: umin(a,sub(a,b)) with scalar types should be folded
+define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: cmpq %rax, %rdi
+; CHECK-NEXT: cmovbq %rdi, %rax
+; CHECK-NEXT: retq
+  %sub = sub i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+  ret i64 %cond
+}
+
+; Positive test: multi-use is OK since the sub instruction still runs once
+define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i64_multi_use:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: movq %rax, (%rdx)
+; CHECK-NEXT: cmpq %rdi, %rax
+; CHECK-NEXT: cmovaeq %rdi, %rax
+; CHECK-NEXT: retq
+  %sub = sub i64 %a, %b
+  store i64 %sub, ptr addrspace(1) %ptr
+  %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
+
+; Positive test: i32
+define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: underflow_compare_fold_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: cmpl %edi, %eax
+; CHECK-NEXT: cmovael %edi, %eax
+; CHECK-NEXT: retq
+  %sub = sub i32 %a, %b
+  %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+  ret i32 %cond
+}
+
+; Positive test: i32
+define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) {
+; CHECK-LABEL: underflow_compare_fold_i32_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: cmpl %eax, %edi
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: retq
+  %sub = sub i32 %a, %b
+  %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub)
+  ret i32 %cond
+}
+
+; Positive test: i32
+define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i32_multi_use:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: movl %eax, (%rdx)
+; CHECK-NEXT: cmpl %edi, %eax
+; CHECK-NEXT: cmovael %edi, %eax
+; CHECK-NEXT: retq
+  %sub = sub i32 %a, %b
+  store i32 %sub, ptr addrspace(1) %ptr
+  %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+  ret i32 %cond
+}
+
+; Positive test: i16
+define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: underflow_compare_fold_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: cmpw %di, %ax
+; CHECK-NEXT: cmovael %edi, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+  %sub = sub i16 %a, %b
+  %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+  ret i16 %cond
+}
+
+; Positive test: i16
+define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) {
+; CHECK-LABEL: underflow_compare_fold_i16_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: cmpw %ax, %di
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+  %sub = sub i16 %a, %b
+  %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub)
+  ret i16 %cond
+}
+
+; Positive test: i16
+define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i16_multi_use:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: movw %ax, (%rdx)
+; CHECK-NEXT: cmpw %di, %ax
+; CHECK-NEXT: cmovael %edi, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+  %sub = sub i16 %a, %b
+  store i16 %sub, ptr addrspace(1) %ptr
+  %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+  ret i16 %cond
+}
+
+
+; Negative test, vector types: umin(sub(a,b),a) but with vectors
+define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: underflow_compare_dontfold_vectors:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: psubb %xmm1, %xmm2
+; CHECK-NEXT: pminub %xmm2, %xmm0
+; CHECK-NEXT: retq
+  %sub = sub <16 x i8> %a, %b
+  %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
+  ret <16 x i8> %cond
+}
+
+; Negative test, pattern mismatch: umin(add(a,b),a)
+define i64 @umin_add(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_add:
+; CHECK: # %bb.0:
+; CHECK-NEXT: leaq (%rsi,%rdi), %rax
+; CHECK-NEXT: cmpq %rdi, %rax
+; CHECK-NEXT: cmovaeq %rdi, %rax
+; CHECK-NEXT: retq
+  %add = add i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
+  ret i64 %cond
+}
diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll
index eacc714..5a68484 100644
--- a/llvm/test/CodeGen/X86/umul_fix.ll
+++ b/llvm/test/CodeGen/X86/umul_fix.ll
@@ -10,10 +10,10 @@ declare <4 x i32> @llvm.umul.fix.v4i32(<4 x i32>, <4 x i32>, i32)
 define i32 @func(i32 %x, i32 %y) nounwind {
 ; X64-LABEL: func:
 ; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: imulq %rcx, %rax
+; X64-NEXT: movl %eax, %ecx
 ; X64-NEXT: shrq $32, %rax
 ; X64-NEXT: shldl $30, %ecx, %eax
 ; X64-NEXT: # kill: def $eax killed $eax killed $rax
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index 759055d..1a92365 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -138,22 +138,25 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [683,u,819,u]
 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+;
SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1024,2048,2048,2] +; SSE2-NEXT: pmuludq %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7 ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2048,u,2,u] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047] -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: pslld $10, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; SSE2-NEXT: orps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll index 94c7892..3d0d73b 100644 --- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; SSE-LABEL: fold_urem_vec_1: @@ -110,16 +112,27 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; Don't fold if we can combine urem with udiv. 
define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { -; SSE-LABEL: combine_urem_udiv: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] -; SSE-NEXT: pmulhuw %xmm0, %xmm1 -; SSE-NEXT: psrlw $6, %xmm1 -; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] -; SSE-NEXT: pmullw %xmm1, %xmm2 -; SSE-NEXT: psubw %xmm2, %xmm0 -; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: combine_urem_udiv: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE2-NEXT: pmulhuw %xmm0, %xmm1 +; SSE2-NEXT: psrlw $6, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE2-NEXT: pmullw %xmm1, %xmm2 +; SSE2-NEXT: psubw %xmm2, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: combine_urem_udiv: +; SSE4: # %bb.0: +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE4-NEXT: pmulhuw %xmm0, %xmm1 +; SSE4-NEXT: psrlw $6, %xmm1 +; SSE4-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE4-NEXT: pmullw %xmm1, %xmm2 +; SSE4-NEXT: psubw %xmm2, %xmm0 +; SSE4-NEXT: paddw %xmm1, %xmm0 +; SSE4-NEXT: retq ; ; AVX-LABEL: combine_urem_udiv: ; AVX: # %bb.0: @@ -137,24 +150,43 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; Don't fold for divisors that are a power of two. define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { -; SSE-LABEL: dont_fold_urem_power_of_two: -; SSE: # %bb.0: -; SSE-NEXT: pmovsxbd {{.*#+}} xmm1 = [63,63,63,63] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: andl $31, %eax -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: andl $7, %eax -; SSE-NEXT: pinsrw $2, %eax, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; SSE-NEXT: shrl $22, %ecx -; SSE-NEXT: imull $95, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: dont_fold_urem_power_of_two: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: andl $31, %eax +; SSE2-NEXT: pinsrw $1, %eax, %xmm1 +; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: andl $7, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm1 +; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 +; SSE2-NEXT: shrl $22, %ecx +; SSE2-NEXT: imull $95, %ecx, %ecx +; SSE2-NEXT: subl %ecx, %eax +; SSE2-NEXT: pinsrw $3, %eax, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: dont_fold_urem_power_of_two: +; SSE4: # %bb.0: +; SSE4-NEXT: pmovsxbd {{.*#+}} xmm1 = [63,63,63,63] +; SSE4-NEXT: pand %xmm0, %xmm1 +; SSE4-NEXT: pextrw $1, %xmm0, %eax +; SSE4-NEXT: andl $31, %eax +; SSE4-NEXT: pinsrw $1, %eax, %xmm1 +; SSE4-NEXT: pextrw $2, %xmm0, %eax +; SSE4-NEXT: andl $7, %eax +; SSE4-NEXT: pinsrw $2, %eax, %xmm1 +; SSE4-NEXT: pextrw $3, %xmm0, %eax +; SSE4-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 +; SSE4-NEXT: shrl $22, %ecx +; SSE4-NEXT: imull $95, %ecx, %ecx +; SSE4-NEXT: subl %ecx, %eax +; SSE4-NEXT: pinsrw $3, %eax, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 +; SSE4-NEXT: retq ; ; AVX1-LABEL: dont_fold_urem_power_of_two: ; AVX1: # %bb.0: @@ -190,6 +222,23 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; AVX2-NEXT: subl %ecx, %eax ; AVX2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 ; AVX2-NEXT: retq +; +; 
AVX512-LABEL: dont_fold_urem_power_of_two: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; AVX512-NEXT: vpextrw $1, %xmm0, %eax +; AVX512-NEXT: andl $31, %eax +; AVX512-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX512-NEXT: vpextrw $2, %xmm0, %eax +; AVX512-NEXT: andl $7, %eax +; AVX512-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512-NEXT: vpextrw $3, %xmm0, %eax +; AVX512-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 +; AVX512-NEXT: shrl $22, %ecx +; AVX512-NEXT: imull $95, %ecx, %ecx +; AVX512-NEXT: subl %ecx, %eax +; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX512-NEXT: retq %1 = urem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95> ret <4 x i16> %1 } @@ -228,36 +277,67 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: dont_fold_urem_one: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $4, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; AVX-NEXT: shrl $25, %ecx -; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; AVX-NEXT: shrl $26, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: dont_fold_urem_one: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpextrw $2, %xmm0, %eax +; AVX1OR2-NEXT: imull $25645, %eax, %ecx # imm = 0x642D +; AVX1OR2-NEXT: shrl $16, %ecx +; AVX1OR2-NEXT: movl %eax, %edx +; AVX1OR2-NEXT: subl %ecx, %edx +; AVX1OR2-NEXT: movzwl %dx, %edx +; AVX1OR2-NEXT: shrl %edx +; AVX1OR2-NEXT: addl %ecx, %edx +; AVX1OR2-NEXT: shrl $4, %edx +; AVX1OR2-NEXT: leal (%rdx,%rdx,2), %ecx +; AVX1OR2-NEXT: shll $3, %ecx +; AVX1OR2-NEXT: subl %ecx, %edx +; AVX1OR2-NEXT: addl %eax, %edx +; AVX1OR2-NEXT: vpextrw $1, %xmm0, %eax +; AVX1OR2-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B +; AVX1OR2-NEXT: shrl $25, %ecx +; AVX1OR2-NEXT: imull $654, %ecx, %ecx # imm = 0x28E +; AVX1OR2-NEXT: subl %ecx, %eax +; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpextrw $3, %xmm0, %eax +; AVX1OR2-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 +; AVX1OR2-NEXT: shrl $26, %ecx +; AVX1OR2-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; AVX1OR2-NEXT: subl %ecx, %eax +; AVX1OR2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: dont_fold_urem_one: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrw $2, %xmm0, %eax +; AVX512-NEXT: imull $25645, %eax, %ecx # imm = 0x642D +; AVX512-NEXT: shrl $16, %ecx +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: subl %ecx, %edx +; AVX512-NEXT: movzwl %dx, %edx +; AVX512-NEXT: shrl %edx +; AVX512-NEXT: addl %ecx, %edx +; AVX512-NEXT: shrl $4, %edx +; AVX512-NEXT: leal (%rdx,%rdx,2), %ecx +; AVX512-NEXT: shll $3, %ecx +; AVX512-NEXT: subl %ecx, %edx +; AVX512-NEXT: vpextrw $1, 
%xmm0, %ecx +; AVX512-NEXT: addl %eax, %edx +; AVX512-NEXT: imull $51307, %ecx, %eax # imm = 0xC86B +; AVX512-NEXT: shrl $25, %eax +; AVX512-NEXT: imull $654, %eax, %eax # imm = 0x28E +; AVX512-NEXT: subl %eax, %ecx +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX512-NEXT: vpextrw $3, %xmm0, %eax +; AVX512-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 +; AVX512-NEXT: shrl $26, %ecx +; AVX512-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; AVX512-NEXT: subl %ecx, %eax +; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX512-NEXT: retq %1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423> ret <4 x i16> %1 } @@ -267,49 +347,96 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_urem_i16_smax: ; CHECK: # %bb.0: ; CHECK-NEXT: retq +; SSE-LABEL: dont_fold_urem_i16_smax: +; SSE: # %bb.0: +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_urem_i16_smax: +; AVX: # %bb.0: +; AVX-NEXT: retq %1 = urem <4 x i16> %x, <i16 1, i16 65536, i16 23, i16 5423> ret <4 x i16> %1 } ; Don't fold i64 urem. define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { -; SSE-LABEL: dont_fold_urem_i64: -; SSE: # %bb.0: -; SSE-NEXT: movq %xmm1, %rcx -; SSE-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: mulq %rdx -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: subq %rdx, %rax -; SSE-NEXT: shrq %rax -; SSE-NEXT: addq %rdx, %rax -; SSE-NEXT: shrq $4, %rax -; SSE-NEXT: leaq (%rax,%rax,2), %rdx -; SSE-NEXT: shlq $3, %rdx -; SSE-NEXT: subq %rdx, %rax -; SSE-NEXT: addq %rcx, %rax -; SSE-NEXT: movq %rax, %xmm2 -; SSE-NEXT: pextrq $1, %xmm1, %rcx -; SSE-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: mulq %rdx -; SSE-NEXT: shrq $12, %rdx -; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F -; SSE-NEXT: subq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: pextrq $1, %xmm0, %rcx -; SSE-NEXT: movq %rcx, %rax -; SSE-NEXT: shrq %rax -; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 -; SSE-NEXT: mulq %rdx -; SSE-NEXT: shrq $7, %rdx -; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E -; SSE-NEXT: subq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: dont_fold_urem_i64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: movq %xmm1, %rcx +; SSE2-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: mulq %rdx +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: subq %rdx, %rax +; SSE2-NEXT: shrq %rax +; SSE2-NEXT: addq %rdx, %rax +; SSE2-NEXT: shrq $4, %rax +; SSE2-NEXT: leaq (%rax,%rax,2), %rdx +; SSE2-NEXT: shlq $3, %rdx +; SSE2-NEXT: subq %rdx, %rax +; SSE2-NEXT: addq %rcx, %rax +; SSE2-NEXT: movq %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE2-NEXT: movq %xmm2, %rcx +; SSE2-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D +; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: mulq %rdx +; SSE2-NEXT: shrq $12, %rdx +; SSE2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; SSE2-NEXT: subq %rax, %rcx +; SSE2-NEXT: movq %rcx, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rcx +; 
SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: shrq %rax +; SSE2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; SSE2-NEXT: mulq %rdx +; SSE2-NEXT: shrq $7, %rdx +; SSE2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; SSE2-NEXT: subq %rax, %rcx +; SSE2-NEXT: movq %rcx, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE2-NEXT: retq +; +; SSE4-LABEL: dont_fold_urem_i64: +; SSE4: # %bb.0: +; SSE4-NEXT: movq %xmm1, %rcx +; SSE4-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 +; SSE4-NEXT: movq %rcx, %rax +; SSE4-NEXT: mulq %rdx +; SSE4-NEXT: movq %rcx, %rax +; SSE4-NEXT: subq %rdx, %rax +; SSE4-NEXT: shrq %rax +; SSE4-NEXT: addq %rdx, %rax +; SSE4-NEXT: shrq $4, %rax +; SSE4-NEXT: leaq (%rax,%rax,2), %rdx +; SSE4-NEXT: shlq $3, %rdx +; SSE4-NEXT: subq %rdx, %rax +; SSE4-NEXT: addq %rcx, %rax +; SSE4-NEXT: movq %rax, %xmm2 +; SSE4-NEXT: pextrq $1, %xmm1, %rcx +; SSE4-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D +; SSE4-NEXT: movq %rcx, %rax +; SSE4-NEXT: mulq %rdx +; SSE4-NEXT: shrq $12, %rdx +; SSE4-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; SSE4-NEXT: subq %rax, %rcx +; SSE4-NEXT: movq %rcx, %xmm1 +; SSE4-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE4-NEXT: pextrq $1, %xmm0, %rcx +; SSE4-NEXT: movq %rcx, %rax +; SSE4-NEXT: shrq %rax +; SSE4-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; SSE4-NEXT: mulq %rdx +; SSE4-NEXT: shrq $7, %rdx +; SSE4-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; SSE4-NEXT: subq %rax, %rcx +; SSE4-NEXT: movq %rcx, %xmm0 +; SSE4-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE4-NEXT: movdqa %xmm2, %xmm1 +; SSE4-NEXT: retq ; ; AVX1-LABEL: dont_fold_urem_i64: ; AVX1: # %bb.0: @@ -388,6 +515,43 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { ; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512-LABEL: dont_fold_urem_i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %rdx +; AVX512-NEXT: movabsq $7218291159277650633, %rax # imm = 0x642C8590B21642C9 +; AVX512-NEXT: mulxq %rax, %rax, %rax +; AVX512-NEXT: movq %rdx, %rcx +; AVX512-NEXT: subq %rax, %rcx +; AVX512-NEXT: shrq %rcx +; AVX512-NEXT: addq %rax, %rcx +; AVX512-NEXT: shrq $4, %rcx +; AVX512-NEXT: leaq (%rcx,%rcx,2), %rax +; AVX512-NEXT: shlq $3, %rax +; AVX512-NEXT: subq %rax, %rcx +; AVX512-NEXT: addq %rdx, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512-NEXT: movabsq $-4513890722074972339, %rax # imm = 0xC15B704DCBCA2F4D +; AVX512-NEXT: mulxq %rax, %rax, %rax +; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: shrq $12, %rax +; AVX512-NEXT: imulq $5423, %rax, %rax # imm = 0x152F +; AVX512-NEXT: subq %rax, %rdx +; AVX512-NEXT: vmovq %rdx, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: movq %rax, %rdx +; AVX512-NEXT: shrq %rdx +; AVX512-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5 +; AVX512-NEXT: mulxq %rcx, %rcx, %rcx +; AVX512-NEXT: shrq $7, %rcx +; AVX512-NEXT: imulq $654, %rcx, %rcx # imm = 0x28E +; AVX512-NEXT: subq %rcx, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: 
retq %1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423> ret <4 x i64> %1 } diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll index e0e1ef7..9768e47 100644 --- a/llvm/test/CodeGen/X86/ushl_sat.ll +++ b/llvm/test/CodeGen/X86/ushl_sat.ll @@ -14,23 +14,23 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl %edi, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl %dx, %eax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll %cl, %edi +; X64-NEXT: movzwl %di, %edx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %eax -; X64-NEXT: cmpw %ax, %di +; X64-NEXT: shrl %cl, %edx +; X64-NEXT: cmpw %dx, %ax ; X64-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: cmovel %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; ; X86-LABEL: func: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movzwl %dx, %esi ; X86-NEXT: shrl %cl, %esi @@ -51,14 +51,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: addl %eax, %eax ; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl %dx, %esi +; X64-NEXT: shll %cl, %eax +; X64-NEXT: movzwl %ax, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %esi -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-NEXT: cmovel %edx, %eax -; X64-NEXT: cwtl +; X64-NEXT: cmpw %si, %dx +; X64-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X64-NEXT: cmovel %eax, %ecx +; X64-NEXT: movswl %cx, %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll index b8e83da..762088c 100644 --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -300,95 +300,94 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %si +; X86-NEXT: movzwl %ax, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: cmpw %si, %dx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $65535, %esi # imm = 0xFFFF -; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: movl $65535, %edx # imm = 0xFFFF +; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: 
shrl %cl, %edx -; X86-NEXT: cmpw %dx, %bp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ebp ; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movzwl %bp, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %si -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bp, %eax +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: cmpw %ax, %di +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %eax, %ebp -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: cmovnel %edx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: cmpw %si, %dx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bx, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $65535, %esi # imm = 0xFFFF ; X86-NEXT: cmovnel %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movzwl %di, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: movzwl %bp, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmovnel %esi, %ebp +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edi, %eax ; X86-NEXT: shll %cl, %edi -; X86-NEXT: movzwl %di, %eax -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx +; X86-NEXT: movzwl %di, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax ; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: movl $65535, %ebx # imm = 0xFFFF ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movzwl %si, %eax +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movzwl %dx, %eax ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: cmpw %ax, %si +; X86-NEXT: cmovnel %ebx, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: movzwl %ax, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: cmpw %si, %bx ; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movw %ax, 14(%ecx) -; X86-NEXT: movw %si, 12(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) ; X86-NEXT: movw %di, 10(%ecx) -; X86-NEXT: movw %bx, 8(%ecx) -; X86-NEXT: movw %bp, 6(%ecx) +; X86-NEXT: movw %bp, 8(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 6(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 4(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 2(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 
(%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl $12, %esp +; X86-NEXT: addl $16, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/vector-compress-freeze.ll b/llvm/test/CodeGen/X86/vector-compress-freeze.ll new file mode 100644 index 0000000..981557f --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-compress-freeze.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl | FileCheck %s + +declare <16 x i32> @llvm.experimental.vector.compress.v16i32(<16 x i32>, <16 x i1>, <16 x i32>) + +define <16 x i32> @test_compress_freeze_elimination(<16 x i32> %a0, <16 x i32> %a1, <16 x i8> %a3) { +; CHECK-LABEL: test_compress_freeze_elimination: +; CHECK: # %bb.0: +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: vpcompressd %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %cmp = icmp sgt <16 x i32> %a0, %a1 + %ext = zext <16 x i8> %a3 to <16 x i32> + %cpr = call <16 x i32> @llvm.experimental.vector.compress.v16i32(<16 x i32> %ext, <16 x i1> %cmp, <16 x i32> splat(i32 15)) + %fr = freeze <16 x i32> %cpr + %and = and <16 x i32> %fr, splat(i32 255) + ret <16 x i32> %and +} + +define <16 x i32> @test_compress_freeze(<16 x i32> %a0, <16 x i32> %a1, <16 x i8> %a3) { +; CHECK-LABEL: test_compress_freeze: +; CHECK: # %bb.0: +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; CHECK-NEXT: vpcompressd %zmm0, %zmm0 {%k1} +; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: retq + %cmp = icmp sgt <16 x i32> %a0, %a1 + %ext = zext <16 x i8> %a3 to <16 x i32> + %cpr = call <16 x i32> @llvm.experimental.vector.compress.v16i32(<16 x i32> %ext, <16 x i1> %cmp, <16 x i32> poison) + %fr = freeze <16 x i32> %cpr + %and = and <16 x i32> %fr, splat(i32 255) + ret <16 x i32> %and +} diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index 4a5b427..88d3ad1 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -4143,11 +4143,11 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v3i32_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx +; CHECK-NEXT: cvttss2si 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx +; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: movd %ecx, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq @@ -4155,10 +4155,10 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f32() #0 { ; AVX-LABEL: constrained_vector_fptosi_v3i32_v3f32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx +; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -4256,11 +4256,11 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f32() #0 { ; AVX1-LABEL: constrained_vector_fptosi_v3i64_v3f32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx +; AVX1-NEXT: vmovq %rdx, %xmm0 +; AVX1-NEXT: vmovq %rcx, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -4268,11 +4268,11 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f32() #0 { ; AVX512-LABEL: constrained_vector_fptosi_v3i64_v3f32: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx +; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx +; AVX512-NEXT: vmovq %rdx, %xmm0 +; AVX512-NEXT: vmovq %rcx, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -4382,11 +4382,11 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v3i32_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx +; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: movd %ecx, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq @@ -4394,10 +4394,10 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64() #0 { ; AVX-LABEL: constrained_vector_fptosi_v3i32_v3f64: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vcvttsd2si 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx +; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -4498,11 +4498,11 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f64() #0 { ; AVX1-LABEL: constrained_vector_fptosi_v3i64_v3f64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx +; AVX1-NEXT: vmovq %rdx, %xmm0 +; AVX1-NEXT: vmovq %rcx, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -4510,11 +4510,11 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f64() #0 { ; AVX512-LABEL: constrained_vector_fptosi_v3i64_v3f64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx +; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx +; AVX512-NEXT: vmovq %rdx, %xmm0 +; AVX512-NEXT: vmovq %rcx, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -4645,11 +4645,11 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i32_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx +; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx +; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: movd %ecx, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq @@ -4658,19 +4658,19 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32() #0 { ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptoui_v3i32_v3f32: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; 
AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx +; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx +; AVX512-NEXT: vmovd %edx, %xmm0 +; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX512-NEXT: retq entry: @@ -4911,7 +4911,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; ; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX1-NEXT: vmovss {{.*#+}} xmm2 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; AVX1-NEXT: vcomiss %xmm2, %xmm0 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 @@ -4921,51 +4921,51 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; AVX1-NEXT: vmovaps %xmm0, %xmm3 ; AVX1-NEXT: .LBB123_2: # %entry ; AVX1-NEXT: vsubss %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vcvttss2si %xmm2, %rax -; AVX1-NEXT: setbe %cl -; AVX1-NEXT: movzbl %cl, %ecx -; AVX1-NEXT: shlq $63, %rcx -; AVX1-NEXT: xorq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vmovss {{.*#+}} xmm3 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] -; AVX1-NEXT: vcomiss %xmm3, %xmm0 -; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vcvttss2si %xmm2, %rcx +; AVX1-NEXT: setbe %al +; AVX1-NEXT: movzbl %al, %eax +; AVX1-NEXT: shlq $63, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vmovss {{.*#+}} xmm2 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX1-NEXT: vcomiss %xmm2, %xmm0 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: ja .LBB123_4 ; AVX1-NEXT: # %bb.3: # %entry -; AVX1-NEXT: vmovaps %xmm0, %xmm4 +; AVX1-NEXT: vmovaps %xmm0, %xmm3 ; AVX1-NEXT: .LBB123_4: # %entry -; AVX1-NEXT: vsubss %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vcvttss2si %xmm3, %rax +; AVX1-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttss2si %xmm2, %rdx ; AVX1-NEXT: setbe %cl ; AVX1-NEXT: movzbl %cl, %ecx ; AVX1-NEXT: shlq $63, %rcx -; AVX1-NEXT: xorq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vmovss {{.*#+}} xmm3 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] -; AVX1-NEXT: vcomiss %xmm3, %xmm0 +; AVX1-NEXT: xorq %rdx, %rcx +; AVX1-NEXT: vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX1-NEXT: vcomiss %xmm2, %xmm0 ; AVX1-NEXT: ja .LBB123_6 ; AVX1-NEXT: # %bb.5: # %entry ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: .LBB123_6: # %entry -; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm0 -; AVX1-NEXT: vcvttss2si %xmm0, %rax -; AVX1-NEXT: setbe %cl -; AVX1-NEXT: movzbl %cl, %ecx -; AVX1-NEXT: shlq $63, %rcx -; AVX1-NEXT: xorq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vcvttss2si %xmm0, %rdx +; AVX1-NEXT: setbe %sil +; AVX1-NEXT: movzbl %sil, %esi +; AVX1-NEXT: shlq $63, %rsi +; AVX1-NEXT: xorq %rdx, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm0 +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f32: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vcvttss2usi 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx +; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx +; AVX512-NEXT: vmovq %rdx, %xmm0 +; AVX512-NEXT: vmovq %rcx, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -5194,11 +5194,11 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i32_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx +; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx +; CHECK-NEXT: movd %edx, %xmm1 +; CHECK-NEXT: movd %ecx, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq @@ -5207,19 +5207,19 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64() #0 { ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptoui_v3i32_v3f64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax -; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx +; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx +; AVX512-NEXT: vmovd %edx, %xmm0 +; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; AVX512-NEXT: retq entry: @@ -5466,7 +5466,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; ; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0] +; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = [4.2299999999999997E+1,0.0E+0] ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [9.2233720368547758E+18,0.0E+0] ; AVX1-NEXT: vcomisd %xmm2, %xmm0 ; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 @@ -5476,51 +5476,51 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; AVX1-NEXT: vmovapd %xmm0, %xmm3 ; AVX1-NEXT: .LBB131_2: # %entry ; AVX1-NEXT: vsubsd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vcvttsd2si %xmm2, %rax -; AVX1-NEXT: setbe %cl -; AVX1-NEXT: movzbl %cl, %ecx -; AVX1-NEXT: shlq $63, %rcx -; AVX1-NEXT: xorq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = [4.2100000000000001E+1,0.0E+0] -; AVX1-NEXT: vcomisd %xmm3, %xmm0 -; AVX1-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vcvttsd2si %xmm2, %rcx +; AVX1-NEXT: setbe %al +; AVX1-NEXT: movzbl %al, 
%eax +; AVX1-NEXT: shlq $63, %rax +; AVX1-NEXT: xorq %rcx, %rax +; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = [4.2100000000000001E+1,0.0E+0] +; AVX1-NEXT: vcomisd %xmm2, %xmm0 +; AVX1-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: ja .LBB131_4 ; AVX1-NEXT: # %bb.3: # %entry -; AVX1-NEXT: vmovapd %xmm0, %xmm4 +; AVX1-NEXT: vmovapd %xmm0, %xmm3 ; AVX1-NEXT: .LBB131_4: # %entry -; AVX1-NEXT: vsubsd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vcvttsd2si %xmm3, %rax +; AVX1-NEXT: vsubsd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcvttsd2si %xmm2, %rdx ; AVX1-NEXT: setbe %cl ; AVX1-NEXT: movzbl %cl, %ecx ; AVX1-NEXT: shlq $63, %rcx -; AVX1-NEXT: xorq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = [4.2299999999999997E+1,0.0E+0] -; AVX1-NEXT: vcomisd %xmm3, %xmm0 +; AVX1-NEXT: xorq %rdx, %rcx +; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0] +; AVX1-NEXT: vcomisd %xmm2, %xmm0 ; AVX1-NEXT: ja .LBB131_6 ; AVX1-NEXT: # %bb.5: # %entry ; AVX1-NEXT: vmovapd %xmm0, %xmm1 ; AVX1-NEXT: .LBB131_6: # %entry -; AVX1-NEXT: vsubsd %xmm1, %xmm3, %xmm0 -; AVX1-NEXT: vcvttsd2si %xmm0, %rax -; AVX1-NEXT: setbe %cl -; AVX1-NEXT: movzbl %cl, %ecx -; AVX1-NEXT: shlq $63, %rcx -; AVX1-NEXT: xorq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vcvttsd2si %xmm0, %rdx +; AVX1-NEXT: setbe %sil +; AVX1-NEXT: movzbl %sil, %esi +; AVX1-NEXT: shlq $63, %rsi +; AVX1-NEXT: xorq %rdx, %rsi +; AVX1-NEXT: vmovq %rsi, %xmm0 +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx +; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx +; AVX512-NEXT: vmovq %rdx, %xmm0 +; AVX512-NEXT: vmovq %rcx, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax ; AVX512-NEXT: vmovq %rax, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -5731,26 +5731,26 @@ entry: define <3 x float> @constrained_vector_fptrunc_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptrunc_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0] ; CHECK-NEXT: cvtsd2ss %xmm0, %xmm1 ; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; CHECK-NEXT: cvtsd2ss %xmm0, %xmm0 -; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.2299999999999997E+1,0.0E+0] -; CHECK-NEXT: cvtsd2ss %xmm1, %xmm1 +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0] +; CHECK-NEXT: cvtsd2ss %xmm2, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptrunc_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = 
[4.2299999999999997E+1,0.0E+0] ; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2100000000000001E+1,0.0E+0] ; AVX-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2299999999999997E+1,0.0E+0] -; AVX-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX-NEXT: vmovsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0] +; AVX-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX-NEXT: retq entry: %result = call <3 x float> @llvm.experimental.constrained.fptrunc.v3f32.v3f64( @@ -5834,14 +5834,14 @@ define <3 x double> @constrained_vector_fpext_v3f32() #0 { ; ; AVX-LABEL: constrained_vector_fpext_v3f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] -; AVX-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq entry: %result = call <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f32( @@ -6702,14 +6702,14 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 { ; ; AVX-LABEL: constrained_vector_sitofp_v3f64_v3i32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vextractps $1, %xmm0, %eax +; AVX-NEXT: vextractps $2, %xmm0, %eax ; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm1 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm2 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vpextrd $1, %xmm0, %eax ; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: retq entry: %result = call <3 x double> @@ -6722,31 +6722,31 @@ entry: define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 { ; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtsi2ss %eax, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; CHECK-NEXT: movd %xmm2, %eax ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtsi2ss %eax, %xmm2 -; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2ss %eax, %xmm0 -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_sitofp_v3f32_v3i32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: 
vextractps $1, %xmm0, %eax +; AVX-NEXT: vextractps $2, %xmm0, %eax ; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm1 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm2 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: vpextrd $1, %xmm0, %eax ; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX-NEXT: retq entry: %result = call <3 x float> @@ -6769,28 +6769,28 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; ; AVX1-LABEL: constrained_vector_sitofp_v3f64_v3i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_sitofp_v3f64_v3i64: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %rax ; AVX512-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 -; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rax ; AVX512-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 -; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq entry: %result = call <3 x double> @@ -6803,39 +6803,38 @@ entry: define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvtsi2ss %rsi, %xmm1 -; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 -; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtsi2ss %rdx, %xmm1 +; CHECK-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-NEXT: cvtsi2ss %rsi, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_sitofp_v3f32_v3i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX1-NEXT: vzeroupper ; 
AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_sitofp_v3f32_v3i64: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %rax ; AVX512-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rax ; AVX512-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3] +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: @@ -7415,26 +7414,26 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 { ; ; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vextractps $2, %xmm0, %eax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vpextrd $1, %xmm0, %eax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vextractps $2, %xmm0, %eax ; AVX512-NEXT: vcvtusi2sd %eax, %xmm15, %xmm1 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vcvtusi2sd %eax, %xmm15, %xmm2 -; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-NEXT: vpextrd $1, %xmm0, %eax ; AVX512-NEXT: vcvtusi2sd %eax, %xmm15, %xmm0 -; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq entry: %result = call <3 x double> @@ -7447,43 +7446,43 @@ entry: define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 { ; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtsi2ss %rax, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; CHECK-NEXT: movd %xmm2, %eax ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtsi2ss %rax, %xmm2 -; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cvtsi2ss %rax, %xmm0 -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractps $1, %xmm0, %eax +; AVX1-NEXT: vextractps $2, %xmm0, %eax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 -; AVX1-NEXT: vinsertps 
{{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: vpextrd $1, %xmm0, %eax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vextractps $1, %xmm0, %eax +; AVX512-NEXT: vextractps $2, %xmm0, %eax ; AVX512-NEXT: vcvtusi2ss %eax, %xmm15, %xmm1 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vcvtusi2ss %eax, %xmm15, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512-NEXT: vpextrd $2, %xmm0, %eax +; AVX512-NEXT: vpextrd $1, %xmm0, %eax ; AVX512-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3] +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX512-NEXT: retq entry: %result = call <3 x float> @@ -7539,7 +7538,8 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; ; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: movl %eax, %edx @@ -7565,9 +7565,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; AVX1-NEXT: # %bb.3: ; AVX1-NEXT: vaddsd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB183_4: # %entry -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: movl %eax, %edx @@ -7580,20 +7578,21 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; AVX1-NEXT: # %bb.5: ; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: .LBB183_6: # %entry -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i64: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %rax ; AVX512-NEXT: vcvtusi2sd %rax, %xmm15, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vcvtusi2sd %rax, %xmm15, %xmm2 -; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rax ; AVX512-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 -; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq entry: %result = call <3 x double> @@ -7606,13 +7605,13 @@ entry: define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: movl %edx, %ecx ; CHECK-NEXT: andl $1, %ecx ; CHECK-NEXT: orq %rax, %rcx -; CHECK-NEXT: testq %rsi, %rsi -; CHECK-NEXT: cmovnsq %rsi, %rcx +; 
CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovnsq %rdx, %rcx ; CHECK-NEXT: cvtsi2ss %rcx, %xmm1 ; CHECK-NEXT: jns .LBB184_2 ; CHECK-NEXT: # %bb.1: @@ -7630,26 +7629,26 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: addss %xmm0, %xmm0 ; CHECK-NEXT: .LBB184_4: # %entry -; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: movq %rsi, %rax ; CHECK-NEXT: shrq %rax -; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: movl %esi, %ecx ; CHECK-NEXT: andl $1, %ecx ; CHECK-NEXT: orq %rax, %rcx -; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: cmovnsq %rdx, %rcx -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: cvtsi2ss %rcx, %xmm1 +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: cmovnsq %rsi, %rcx +; CHECK-NEXT: cvtsi2ss %rcx, %xmm2 ; CHECK-NEXT: jns .LBB184_6 ; CHECK-NEXT: # %bb.5: -; CHECK-NEXT: addss %xmm1, %xmm1 +; CHECK-NEXT: addss %xmm2, %xmm2 ; CHECK-NEXT: .LBB184_6: # %entry +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: movl %eax, %edx @@ -7675,9 +7674,7 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; AVX1-NEXT: # %bb.3: ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB184_4: # %entry -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: movl %eax, %edx @@ -7690,21 +7687,22 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; AVX1-NEXT: # %bb.5: ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: .LBB184_6: # %entry -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i64: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %rax ; AVX512-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rax ; AVX512-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3] +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll index 304daab..2e85a4e 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -319,9 +319,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE2-LABEL: constant_funnnel_v2i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd 
{{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,u,u] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,u,u] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -333,8 +333,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,u,u] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,u,u] ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -345,8 +345,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,1,u] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,32,u,u] +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,u,u] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] @@ -411,9 +411,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; X86-SSE2-LABEL: constant_funnnel_v2i32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,1,u] +; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,32,u,u] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll index ae5dd18..8db5414 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll @@ -499,11 +499,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-NEXT: psrld $28, %xmm1 ; SSE2-NEXT: psrld $27, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,u,1,u] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE2-NEXT: pslld $4, %xmm0 +; SSE2-NEXT: pslld $5, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} 
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -514,7 +512,10 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE41-NEXT: psrld $27, %xmm2
; SSE41-NEXT: psrld $28, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pslld $5, %xmm1
+; SSE41-NEXT: pslld $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -523,7 +524,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; AVX1-NEXT: vpsrld $27, %xmm1, %xmm2
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1]
+; AVX1-NEXT: vpslld $5, %xmm0, %xmm2
+; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -597,11 +600,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; X86-SSE2-NEXT: psrld $28, %xmm1
; X86-SSE2-NEXT: psrld $27, %xmm2
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,u,1,u]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: pslld $4, %xmm0
+; X86-SSE2-NEXT: pslld $5, %xmm2
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 33a6a76..30205259 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1989,11 +1989,11 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,1,2,4,8,16,32,64]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,u,1,u,2,u,4,u,8,u,16,u,32,u,64,u]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,64,32,16,8,4,2,1]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,u,64,u,32,u,16,u,8,u,4,u,2,u,1,u]
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
@@ -2014,7 +2014,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; SSE41-NEXT: psllw $8, %xmm1
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -2033,7 +2033,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -2149,11 +2149,11 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; X86-SSE2-NEXT: paddb %xmm0, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [128,1,2,4,8,16,32,64]
+; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [128,u,1,u,2,u,4,u,8,u,16,u,32,u,64,u]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X86-SSE2-NEXT: pand %xmm3, %xmm2
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [128,64,32,16,8,4,2,1]
+; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [128,u,64,u,32,u,16,u,8,u,4,u,2,u,1,u]
; X86-SSE2-NEXT: pand %xmm3, %xmm0
; X86-SSE2-NEXT: packuswb %xmm2, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 217431be..0cffa1b 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1631,9 +1631,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [128,32,8,2,128,2,8,32]
-; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpmaddubsw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
+; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1653,7 +1653,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; AVX2-NEXT: vpsllw $8, %ymm2, %ymm2
; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -1672,7 +1672,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -1690,7 +1690,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; AVX512VL-NEXT: vpsllw $8, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | ymm1 | ymm2
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 3a522cc..25f8f94 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -915,10 +915,10 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
; AVX512F-NEXT: vpsllw $8, %ymm3, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3
@@ -957,10 +957,10 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VL-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
index 4b42b18..17bbfa1 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
@@ -341,9 +341,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE2-LABEL: constant_funnnel_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,u,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,u,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -355,8 +355,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,u,u]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,u,u]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -367,8 +367,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,1,u]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,1,1]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,134217728,u,u]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,u,u]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -433,9 +433,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; X86-SSE2-LABEL: constant_funnnel_v2i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,1,1]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,u,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,u,1,u]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,134217728,u,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
index 2d8670a..144e77b 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
@@ -497,42 +497,35 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %
define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: constant_funnnel_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: psrld $5, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psrld $4, %xmm3
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE2-NEXT: pslld $28, %xmm0
-; SSE2-NEXT: pslld $27, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pslld $27, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrld $5, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrld $4, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT: psrld $4, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pslld $27, %xmm1
; SSE41-NEXT: pslld $28, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_funnnel_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $5, %xmm1, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpsrld $4, %xmm1, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpslld $27, %xmm0, %xmm2
; AVX1-NEXT: vpslld $28, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
@@ -606,17 +599,15 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
;
; X86-SSE2-LABEL: constant_funnnel_v2i32:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X86-SSE2-NEXT: psrld $4, %xmm1
; X86-SSE2-NEXT: psrld $5, %xmm2
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
-; X86-SSE2-NEXT: psrld $4, %xmm3
-; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; X86-SSE2-NEXT: pslld $28, %xmm0
-; X86-SSE2-NEXT: pslld $27, %xmm1
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE2-NEXT: por %xmm3, %xmm0
+; X86-SSE2-NEXT: pslld $27, %xmm2
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
%res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 5>)
ret <2 x i32> %res
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index e68d1d7..3117865 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -691,11 +691,11 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: psubb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [14,13,12,11,10,9,9,7]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [14,u,13,u,12,u,11,u,10,u,9,u,9,u,7,u]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [7,8,9,10,11,12,13,14]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [7,u,8,u,9,u,10,u,11,u,12,u,13,u,14,u]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: psubb %xmm1, %xmm0
@@ -731,7 +731,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: por %xmm1, %xmm2
; SSE41-NEXT: psubb %xmm2, %xmm0
@@ -762,7 +762,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index 7355f36..fa5692a 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -660,7 +660,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm5
; AVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm5 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm5 # [22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -686,7 +686,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22]
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
@@ -720,7 +720,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX2NOBW-NEXT: vpsllw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index 5445330..b11756a 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -544,7 +544,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm5
; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 # [38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -570,7 +570,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsubb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38]
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38]
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -603,7 +603,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmovb2m %zmm1, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX512BW-NEXT: vpsllw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index 6cd5098..cbc2b96 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -787,13 +787,13 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,32,32,32,128,128,64]
; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [14,13,12,11,10,9,9,7]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [14,u,13,u,12,u,11,u,10,u,9,u,9,u,7,u]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,256,128,32,32,32,64,64]
; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,u,8,u,9,u,10,u,11,u,12,u,13,u,14,u]
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: psubb %xmm2, %xmm0
@@ -840,7 +840,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: por %xmm1, %xmm2
; SSE41-NEXT: psubb %xmm2, %xmm0
@@ -882,7 +882,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 98ea87c..ca57359 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -702,7 +702,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [32,16,16,128,64,16,256,32]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 # [22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -739,7 +739,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [64,256,128,32,32,32,64,64]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22]
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
@@ -781,7 +781,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX2NOBW-NEXT: vpsllw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index a11fa370..b8a131e 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -575,7 +575,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [64,16,32,8,8,8,256,16,32,16,16,128,64,16,256,32]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm4
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 # [38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm5
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -609,7 +609,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,256,128,32,32,32,64,64,16,16,64,32,128,256,16,16]
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38]
; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38]
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -648,7 +648,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX512BW-NEXT: vpsllw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index dbb4b9f..e0410ae 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -84,11 +84,11 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: movq %xmm1, (%rsi)
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE-NEXT: movq %xmm1, (%rsi)
; SSE-NEXT: movq %xmm0, (%rdx)
; SSE-NEXT: retq
;
@@ -96,8 +96,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm1, (%rsi)
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, (%rdx)
; AVX-NEXT: retq
;
@@ -105,8 +105,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovq %xmm1, (%rsi)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovq %xmm0, (%rdx)
; AVX2-NEXT: retq
;
@@ -114,8 +114,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vmovq %xmm1, (%rsi)
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FP-NEXT: retq
;
@@ -123,17 +123,17 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i16_stride2_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovdw %xmm0, (%rsi)
-; AVX512-NEXT: vmovq %xmm1, (%rdx)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm0, (%rdx)
; AVX512-NEXT: retq
%wide.vec = load <8 x i16>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index da902b3..c932482 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -196,18 +196,18 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,3,2,3,4,5,6,7]
+; SSE-NEXT: movq %xmm2, (%rsi)
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,3,2,3,4,5,6,7]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
+; SSE-NEXT: movq %xmm1, (%rdx)
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: movq %xmm2, (%rsi)
-; SSE-NEXT: movq %xmm1, (%rdx)
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: movq %xmm0, (%rcx)
; SSE-NEXT: retq
;
@@ -217,14 +217,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm2, (%rsi)
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm2, (%rdx)
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovq %xmm2, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
; AVX-NEXT: vmovq %xmm0, (%rcx)
; AVX-NEXT: retq
;
@@ -234,14 +234,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
; AVX2-NEXT: vmovq %xmm0, (%rcx)
; AVX2-NEXT: retq
;
@@ -251,13 +251,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
; AVX2-FP-NEXT: vmovq %xmm0, (%rcx)
; AVX2-FP-NEXT: retq
;
@@ -267,13 +267,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX2-FCP-NEXT: retq
;
@@ -283,14 +283,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
; AVX512-NEXT: vmovq %xmm0, (%rcx)
; AVX512-NEXT: retq
;
@@ -300,13 +300,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx)
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512-FCP-NEXT: retq
;
@@ -316,14 +316,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-NEXT: retq
;
@@ -333,13 +333,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx)
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-FCP-NEXT: retq
;
@@ -348,15 +348,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm2, (%rcx)
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -365,13 +366,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11]
-; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11]
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -380,15 +381,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3]
-; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm3
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx)
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3]
+; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -397,13 +399,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11]
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <12 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index 01aacc1..d4e5d4c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -220,20 +220,20 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE-NEXT: movq %xmm5, (%rsi)
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: movq %xmm3, (%rdx)
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,0,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: movq %xmm3, (%rcx)
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movq %xmm5, (%rsi)
-; SSE-NEXT: movq %xmm3, (%rdx)
-; SSE-NEXT: movq %xmm4, (%rcx)
; SSE-NEXT: movq %xmm0, (%r8)
; SSE-NEXT: retq
;
@@ -246,23 +246,23 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
; AVX-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; AVX-NEXT: vmovq %xmm0, (%rdx)
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX-NEXT: vmovq %xmm2, (%rcx)
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
-; AVX-NEXT: vmovq %xmm4, (%rcx)
-; AVX-NEXT: vmovq %xmm1, (%r8)
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX-NEXT: vmovq %xmm0, (%r8)
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i16_stride4_vf4:
@@ -274,23 +274,23 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; AVX2-NEXT: vmovq %xmm0, (%rdx)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-NEXT: vmovq %xmm1, (%r8)
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-NEXT: vmovq %xmm0, (%r8)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride4_vf4:
@@ -302,22 +302,22 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
+; AVX2-FP-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-FP-NEXT: vmovq %xmm0, (%rdx)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-FP-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm1, (%r8)
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-FP-NEXT: vmovq %xmm0, (%r8)
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i16_stride4_vf4:
@@ -329,125 +329,125 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm1, (%r8)
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%r8)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i16_stride4_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqw %ymm0, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride4_vf4:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512-FCP-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512-FCP-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512-FCP-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512-FCP-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpmovqw %ymm0, (%r8)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride4_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512DQ-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512DQ-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512DQ-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512DQ-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512DQ-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512DQ-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovqw %ymm0, (%r8)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride4_vf4:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512DQ-FCP-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512DQ-FCP-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovqw %ymm0, (%r8)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i16_stride4_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512BW-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512BW-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512BW-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512BW-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512BW-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovqw %ymm0, (%r8)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i16_stride4_vf4:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512BW-FCP-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512BW-FCP-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512BW-FCP-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512BW-FCP-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512BW-FCP-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpmovqw %ymm0, (%r8)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i16_stride4_vf4:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-BW-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512DQ-BW-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512DQ-BW-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512DQ-BW-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512DQ-BW-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512DQ-BW-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512DQ-BW-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512DQ-BW-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512DQ-BW-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vpmovqw %ymm0, (%r8)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf4:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm0, (%r8)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <16 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 9b19ec1..8fb6222 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -288,55 +288,55 @@ define void @load_i16_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: load_i16_stride5_vf4:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm2
-; SSE-NEXT: movdqa 16(%rdi), %xmm3
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
; SSE-NEXT: movdqa 32(%rdi), %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 =
xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,0,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movq %xmm1, (%rsi) -; SSE-NEXT: movq %xmm4, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm6, (%r8) -; SSE-NEXT: movq %xmm7, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movq %xmm4, (%rsi) +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: movq %xmm3, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: punpcklwd 
{{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,0,3,4,5,6,7] +; SSE-NEXT: movq %xmm3, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: movq %xmm3, (%r8) +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movq %xmm3, (%r9) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i16_stride5_vf4: @@ -349,30 +349,30 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX-NEXT: vpsrlq $48, %xmm2, %xmm4 -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] ; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) -; AVX-NEXT: vmovq %xmm1, (%r9) +; AVX-NEXT: vpsrlq $48, %xmm2, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, (%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, (%r8) +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] +; AVX-NEXT: vmovq %xmm0, (%r9) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i16_stride5_vf4: @@ -385,22 +385,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rsi) +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rcx) +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%r8) ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) -; AVX2-NEXT: vmovq %xmm5, (%rcx) -; AVX2-NEXT: vmovq %xmm6, (%r8) ; AVX2-NEXT: vmovq %xmm0, (%r9) ; AVX2-NEXT: retq ; @@ -412,22 +412,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: 
vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FP-NEXT: retq ; @@ -439,58 +439,64 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; 
AVX2-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i16_stride5_vf4: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vpextrw $5, %xmm0, %eax -; AVX512-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512-NEXT: vpextrw $6, %xmm0, %eax -; AVX512-NEXT: vpextrw $1, %xmm0, %r10d -; AVX512-NEXT: vmovd %r10d, %xmm4 -; AVX512-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 -; AVX512-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 -; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpextrw $5, %xmm0, %r10d +; AVX512-NEXT: vmovd %xmm2, %r11d +; AVX512-NEXT: vpextrw $3, %xmm1, %ebx +; AVX512-NEXT: vpextrw $6, %xmm0, %ebp +; AVX512-NEXT: vpextrw $1, %xmm0, %r14d +; AVX512-NEXT: vpinsrw $1, %r10d, %xmm0, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512-NEXT: vmovq %xmm1, (%rsi) +; AVX512-NEXT: vmovd %r14d, %xmm1 +; AVX512-NEXT: vpinsrw $1, %ebp, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $3, %r11d, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, (%rdx) +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%rcx) +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%r8) +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: vmovq %xmm1, (%rdx) -; AVX512-NEXT: vmovq %xmm5, (%rcx) -; 
AVX512-NEXT: vmovq %xmm6, (%r8) ; AVX512-NEXT: vmovq %xmm0, (%r9) +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride5_vf4: @@ -498,65 +504,71 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vpextrw $7, %xmm1, %eax -; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpextrw $3, %xmm1, %eax -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovd %xmm2, %eax +; AVX512-FCP-NEXT: vmovd %xmm2, %r10d +; AVX512-FCP-NEXT: vpextrw $3, %xmm1, %r11d +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpinsrw $3, %r10d, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride5_vf4: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbp +; AVX512DQ-NEXT: pushq %r14 +; AVX512DQ-NEXT: pushq %rbx ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-NEXT: vpextrw $5, %xmm0, %eax -; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-NEXT: vpextrw $7, %xmm1, 
%eax -; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpextrw $6, %xmm0, %eax -; AVX512DQ-NEXT: vpextrw $1, %xmm0, %r10d -; AVX512DQ-NEXT: vmovd %r10d, %xmm4 -; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpextrw $3, %xmm1, %eax -; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 -; AVX512DQ-NEXT: vmovd %xmm2, %eax +; AVX512DQ-NEXT: vpextrw $5, %xmm0, %r10d +; AVX512DQ-NEXT: vmovd %xmm2, %r11d +; AVX512DQ-NEXT: vpextrw $3, %xmm1, %ebx +; AVX512DQ-NEXT: vpextrw $6, %xmm0, %ebp +; AVX512DQ-NEXT: vpextrw $1, %xmm0, %r14d +; AVX512DQ-NEXT: vpinsrw $1, %r10d, %xmm0, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-NEXT: vmovd %r14d, %xmm1 +; AVX512DQ-NEXT: vpinsrw $1, %ebp, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $3, %r11d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%r8) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-NEXT: popq %rbx +; AVX512DQ-NEXT: popq %r14 +; AVX512DQ-NEXT: popq %rbp ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride5_vf4: @@ -564,29 +576,29 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpextrw $7, %xmm1, %eax -; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpextrw $3, %xmm1, %eax -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovd %xmm2, %eax +; AVX512DQ-FCP-NEXT: vmovd %xmm2, %r10d +; AVX512DQ-FCP-NEXT: vpextrw $3, %xmm1, %r11d +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpinsrw $3, %r10d, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-FCP-NEXT: retq ; @@ -600,19 +612,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-NEXT: movl 32(%rdi), %edi ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -626,19 +639,20 @@ define void 
@load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512BW-FCP-NEXT: movl 32(%rdi), %edi ; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-FCP-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -652,19 +666,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-NEXT: movl 32(%rdi), %edi ; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -678,19 +693,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax +; AVX512DQ-BW-FCP-NEXT: movl 32(%rdi), %edi ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; 
AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <20 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index feb75b2..dc8a9ed 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -382,57 +382,57 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: movq %xmm2, (%rsi) ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,2,3] +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,3,2,3] -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: 
shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movq %xmm7, (%rcx) +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movq %xmm2, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movq %xmm3, (%r9) ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm3, (%rdx) -; SSE-NEXT: movq %xmm9, (%rcx) -; SSE-NEXT: movq %xmm2, (%r8) -; SSE-NEXT: movq %xmm6, (%r9) ; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -448,32 +448,32 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX-NEXT: vpsrld $16, %xmm1, %xmm5 +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpsrld $16, %xmm1, %xmm4 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = 
xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm3, (%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm2[4,5],xmm4[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%r8) +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX-NEXT: vmovq %xmm3, (%r9) ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,3,2,3,4,5,6,7] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) -; AVX-NEXT: vmovq %xmm7, (%r9) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -486,24 +486,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%rsi) +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; 
AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm4, (%rcx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm3, (%r8) ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) -; AVX2-NEXT: vmovq %xmm6, (%rcx) -; AVX2-NEXT: vmovq %xmm5, (%r8) ; AVX2-NEXT: vmovq %xmm1, (%r9) +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; @@ -516,23 +516,23 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] -; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] +; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm5, (%r8) ; AVX2-FP-NEXT: vmovq %xmm1, (%r9) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; @@ -545,23 +545,23 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpsrld $16, %xmm1, 
%xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] -; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] +; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: retq ; @@ -574,26 +574,26 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-NEXT: vmovq %xmm3, (%rsi) ; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovq %xmm3, (%rsi) ; AVX512-NEXT: vmovq %xmm0, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) -; AVX512-NEXT: vmovq %xmm1, (%r8) -; AVX512-NEXT: vmovq %xmm5, (%r9) -; AVX512-NEXT: vmovq %xmm2, (%rax) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; 
AVX512-NEXT: vmovq %xmm1, (%rcx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%r8) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm1, (%r9) +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -606,25 +606,25 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -637,26 +637,26 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: 
vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-NEXT: vmovq %xmm2, (%rax) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512DQ-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -669,25 +669,25 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7] +; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11] +; AVX512DQ-FCP-NEXT: vpermd %zmm4, 
%zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -697,22 +697,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -722,22 +722,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; 
AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -747,22 +747,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -772,22 +772,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0] +; 
AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <24 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 038c73b..e89248a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -418,77 +418,77 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,2,3] +; SSE-NEXT: por %xmm6, %xmm7 +; 
SSE-NEXT: movq %xmm7, (%rsi) +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movq %xmm5, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: psrlq $16, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1] -; SSE-NEXT: pslld $16, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movq %xmm7, (%rcx) +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrlq $16, %xmm6 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; SSE-NEXT: pslld $16, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm0, (%rdx) -; SSE-NEXT: movq %xmm7, (%rcx) -; SSE-NEXT: movq %xmm8, (%r8) -; SSE-NEXT: 
movq %xmm6, (%r9) -; SSE-NEXT: movq %xmm10, (%rdi) -; SSE-NEXT: movq %xmm1, (%rax) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movq %xmm5, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movq %xmm4, (%r9) +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movq %xmm5, (%rcx) +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i16_stride7_vf4: @@ -497,54 +497,54 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,3,4,5,6,7] -; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,3] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rdx) +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX-NEXT: vpslld $16, %xmm2, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] -; AVX-NEXT: vpsrlq $16, %xmm4, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX-NEXT: vpsrlq $48, %xmm1, %xmm10 -; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] -; AVX-NEXT: vpsrld $16, %xmm0, %xmm10 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX-NEXT: vmovq %xmm4, (%rcx) +; AVX-NEXT: vpslld $16, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r8) +; AVX-NEXT: vpsrlq $16, %xmm3, %xmm4 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3],xmm10[4,5,6,7] +; AVX-NEXT: vpsrlq $48, %xmm1, %xmm6 +; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r9) +; AVX-NEXT: vpsrld $16, %xmm0, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] +; AVX-NEXT: vmovq %xmm3, (%r10) ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm3, (%rsi) -; AVX-NEXT: vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm7, (%rcx) -; AVX-NEXT: 
vmovq %xmm8, (%r8) -; AVX-NEXT: vmovq %xmm9, (%r9) -; AVX-NEXT: vmovq %xmm4, (%r10) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -552,51 +552,51 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2: # %bb.0: ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm5[2],xmm3[3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3,4,5,6],xmm4[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,7] +; AVX2-NEXT: vmovq %xmm5, (%rsi) +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; AVX2-NEXT: vmovq %xmm5, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vmovq %xmm2, (%rcx) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] 
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm2, (%r8) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vmovq %xmm2, (%r9) +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vmovq %xmm2, (%r10) +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-NEXT: vmovq %xmm1, (%rsi) -; AVX2-NEXT: vmovq %xmm6, (%rdx) -; AVX2-NEXT: vmovq %xmm3, (%rcx) -; AVX2-NEXT: vmovq %xmm4, (%r8) -; AVX2-NEXT: vmovq %xmm5, (%r9) -; AVX2-NEXT: vmovq %xmm7, (%r10) +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -605,8 +605,8 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4 @@ -615,37 +615,37 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm5, (%rsi) +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm5, (%rdx) ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FP-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FP-NEXT: vmovq %xmm4, (%r9) -; AVX2-FP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -654,8 +654,8 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 @@ -664,37 +664,37 @@ define void @load_i16_stride7_vf4(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx) ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm4, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -708,47 +708,47 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512-NEXT: vmovq %xmm5, (%rsi) +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; AVX512-NEXT: vmovq %xmm5, (%rdx) ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX512-NEXT: vpunpckhwd 
{{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vmovq %xmm1, (%rdx) ; AVX512-NEXT: vmovq %xmm2, (%rcx) -; AVX512-NEXT: vmovq %xmm5, (%r8) -; AVX512-NEXT: vmovq %xmm6, (%r9) -; AVX512-NEXT: vmovq %xmm7, (%r10) -; AVX512-NEXT: vmovq %xmm3, (%rax) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm2, (%r8) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovq %xmm2, (%r9) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovq %xmm2, (%r10) +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -756,48 +756,48 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) -; AVX512-FCP-NEXT: vmovq %xmm2, (%rax) +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm4[1,2,3,4,5,6],xmm1[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; 
AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r10) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -810,47 +810,47 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovq %xmm5, (%rsi) +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; AVX512DQ-NEXT: vmovq %xmm5, (%rdx) ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
; AVX512DQ-NEXT: vmovq %xmm2, (%rcx)
-; AVX512DQ-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-NEXT: vmovq %xmm6, (%r9)
-; AVX512DQ-NEXT: vmovq %xmm7, (%r10)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rax)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm2, (%r8)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT: vmovq %xmm2, (%r10)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -858,48 +858,48 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9)
-; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10)
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm4[1,2,3,4,5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm2
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r10)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -910,25 +910,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm1
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-NEXT: vmovq %xmm5, (%r9)
-; AVX512BW-NEXT: vmovq %xmm6, (%r10)
-; AVX512BW-NEXT: vmovq %xmm1, (%rax)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r9)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r10)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -939,25 +939,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r10)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r10)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -968,25 +968,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm7
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm8, %zmm1
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r11)
-; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r11)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r10)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -997,25 +997,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <28 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
index fff21f9..b249950 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -296,41 +296,41 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: movdqa 16(%rdi), %xmm1
-; SSE-NEXT: movdqa 32(%rdi), %xmm2
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movdqa 32(%rdi), %xmm1
; SSE-NEXT: movdqa 48(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; SSE-NEXT: movdqa %xmm5, %xmm6
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[3,3,3,3]
+; SSE-NEXT: movq %xmm6, (%rsi)
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; SSE-NEXT: movq %xmm7, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[3,3,3,3]
; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE-NEXT: movq %xmm5, (%rcx)
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; SSE-NEXT: movq %xmm6, (%r8)
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: movq %xmm2, (%r9)
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: movq %xmm6, (%rsi)
-; SSE-NEXT: movq %xmm8, (%rdx)
-; SSE-NEXT: movq %xmm5, (%rcx)
-; SSE-NEXT: movq %xmm7, (%r8)
-; SSE-NEXT: movq %xmm1, (%r9)
-; SSE-NEXT: movq %xmm4, (%r11)
+; SSE-NEXT: movq %xmm3, (%r11)
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: movq %xmm0, (%r10)
-; SSE-NEXT: movq %xmm3, (%rax)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE-NEXT: movq %xmm2, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i16_stride8_vf4:
@@ -345,28 +345,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX-NEXT: vmovq %xmm6, (%rsi)
+; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
+; AVX-NEXT: vmovq %xmm6, (%rdx)
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX-NEXT: vmovq %xmm6, (%rcx)
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%r8)
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vmovq %xmm6, (%rsi)
-; AVX-NEXT: vmovq %xmm7, (%rdx)
-; AVX-NEXT: vmovq %xmm8, (%rcx)
-; AVX-NEXT: vmovq %xmm4, (%r8)
; AVX-NEXT: vmovq %xmm1, (%r9)
-; AVX-NEXT: vmovq %xmm3, (%r11)
-; AVX-NEXT: vmovq %xmm5, (%r10)
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vmovq %xmm1, (%r11)
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX-NEXT: vmovq %xmm1, (%r10)
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
;
@@ -382,28 +382,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX2-NEXT: vmovq %xmm6, (%rsi)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX2-NEXT: vmovq %xmm6, (%rdx)
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX2-NEXT: vmovq %xmm6, (%rcx)
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-NEXT: vmovq %xmm4, (%r8)
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-NEXT: vmovq %xmm6, (%rsi)
-; AVX2-NEXT: vmovq %xmm7, (%rdx)
-; AVX2-NEXT: vmovq %xmm8, (%rcx)
-; AVX2-NEXT: vmovq %xmm4, (%r8)
; AVX2-NEXT: vmovq %xmm1, (%r9)
-; AVX2-NEXT: vmovq %xmm3, (%r11)
-; AVX2-NEXT: vmovq %xmm5, (%r10)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX2-NEXT: vmovq %xmm1, (%r11)
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vmovq %xmm1, (%r10)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
;
@@ -419,28 +419,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
-; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX2-FP-NEXT: vmovq %xmm6, (%rsi)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX2-FP-NEXT: vmovq %xmm6, (%rdx)
+; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX2-FP-NEXT: vmovq %xmm6, (%rcx)
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vmovq %xmm4, (%r8)
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-FP-NEXT: vmovq %xmm6, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm7, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm8, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm4, (%r8)
; AVX2-FP-NEXT: vmovq %xmm1, (%r9)
-; AVX2-FP-NEXT: vmovq %xmm3, (%r11)
-; AVX2-FP-NEXT: vmovq %xmm5, (%r10)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX2-FP-NEXT: vmovq %xmm1, (%r11)
+; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-FP-NEXT: vmovq %xmm1, (%r10)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FP-NEXT: retq
;
@@ -456,28 +456,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX2-FCP-NEXT: vmovq %xmm6, (%rsi)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm6, (%rdx)
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx)
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm4, (%r8)
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vmovq %xmm6, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm7, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm8, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%r8)
; AVX2-FCP-NEXT: vmovq %xmm1, (%r9)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%r11)
-; AVX2-FCP-NEXT: vmovq %xmm5, (%r10)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm1, (%r11)
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-FCP-NEXT: vmovq %xmm1, (%r10)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FCP-NEXT: retq
;
@@ -493,25 +493,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3]
-; AVX512-NEXT: vpermt2d %xmm4, %xmm9, %xmm5
+; AVX512-NEXT: vmovq %xmm6, (%rsi)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX512-NEXT: vmovq %xmm6, (%rdx)
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT: vmovq %xmm6, (%rcx)
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,3,3]
+; AVX512-NEXT: vpermt2d %xmm4, %xmm6, %xmm5
+; AVX512-NEXT: vmovq %xmm5, (%r8)
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-NEXT: vpermt2d %xmm2, %xmm9, %xmm0
-; AVX512-NEXT: vmovq %xmm6, (%rsi)
-; AVX512-NEXT: vmovq %xmm7, (%rdx)
-; AVX512-NEXT: vmovq %xmm8, (%rcx)
-; AVX512-NEXT: vmovq %xmm5, (%r8)
; AVX512-NEXT: vmovq %xmm1, (%r9)
-; AVX512-NEXT: vmovq %xmm3, (%r11)
-; AVX512-NEXT: vmovq %xmm4, (%r10)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX512-NEXT: vmovq %xmm1, (%r11)
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-NEXT: vmovq %xmm1, (%r10)
+; AVX512-NEXT: vpermt2d %xmm2, %xmm6, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rax)
; AVX512-NEXT: retq
;
@@ -527,25 +527,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1]
-; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm8
-; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3]
-; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5
+; AVX512-FCP-NEXT: vmovq %xmm6, (%rsi)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,5,1,1]
+; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm7
+; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm6, %xmm7
+; AVX512-FCP-NEXT: vmovq %xmm7, (%rdx)
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-FCP-NEXT: vmovq %xmm7, (%rcx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [3,7,3,3]
+; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm5
+; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0
-; AVX512-FCP-NEXT: vmovq %xmm6, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm8, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm9, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
; AVX512-FCP-NEXT: vmovq %xmm1, (%r9)
-; AVX512-FCP-NEXT: vmovq %xmm7, (%r11)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%r10)
+; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm6
+; AVX512-FCP-NEXT: vmovq %xmm6, (%r11)
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-FCP-NEXT: vmovq %xmm1, (%r10)
+; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm7, %xmm0
; AVX512-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512-FCP-NEXT: retq
;
@@ -561,25 +561,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3]
-; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm9, %xmm5
+; AVX512DQ-NEXT: vmovq %xmm6, (%rsi)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX512DQ-NEXT: vmovq %xmm6, (%rdx)
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-NEXT: vmovq %xmm6, (%rcx)
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,3,3]
+; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm6, %xmm5
+; AVX512DQ-NEXT: vmovq %xmm5, (%r8)
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512DQ-NEXT: vpermt2d %xmm2, %xmm9, %xmm0
-; AVX512DQ-NEXT: vmovq %xmm6, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm7, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm8, (%rcx)
-; AVX512DQ-NEXT: vmovq %xmm5, (%r8)
; AVX512DQ-NEXT: vmovq %xmm1, (%r9)
-; AVX512DQ-NEXT: vmovq %xmm3, (%r11)
-; AVX512DQ-NEXT: vmovq %xmm4, (%r10)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX512DQ-NEXT: vmovq %xmm1, (%r11)
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512DQ-NEXT: vmovq %xmm1, (%r10)
+; AVX512DQ-NEXT: vpermt2d %xmm2, %xmm6, %xmm0
; AVX512DQ-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-NEXT: retq
;
@@ -595,25 +595,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5
+; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rsi)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,5,1,1]
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm7
+; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm6, %xmm7
+; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%rdx)
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%rcx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [3,7,3,3]
+; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm5
+; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm9, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9)
-; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r11)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r10)
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm6
+; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r11)
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r10)
+; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm7, %xmm0
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: retq
;
@@ -625,28 +625,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm7
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm8, %zmm1
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-NEXT: vmovq %xmm5, (%r9)
-; AVX512BW-NEXT: vmovq %xmm6, (%r11)
-; AVX512BW-NEXT: vmovq %xmm7, (%r10)
-; AVX512BW-NEXT: vmovq %xmm1, (%rax)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r9)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r11)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r10)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -658,28 +658,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r11)
-; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r11)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r10)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -691,28 +691,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm7
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm8, %zmm1
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r11)
-; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r11)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r10)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -724,28 +724,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r11)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <32 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
index f2c5a91..995d641 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
@@ -20,8 +20,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE-NEXT: movq %xmm1, (%rsi)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE-NEXT: movq %xmm0, (%rdx)
; SSE-NEXT: retq
;
@@ -29,8 +29,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-NEXT: vmovlps %xmm1, (%rsi)
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-NEXT: vmovlps %xmm0, (%rdx)
; AVX-NEXT: retq
;
@@ -38,8 +38,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-NEXT: vmovlps %xmm1, (%rsi)
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-NEXT: vmovlps %xmm0, (%rdx)
; AVX2-NEXT: retq
;
@@ -47,8 +47,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FP-NEXT: vmovlps %xmm1, (%rsi)
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FP-NEXT: vmovlps %xmm0, (%rdx)
; AVX2-FP-NEXT: retq
;
@@ -56,8 +56,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FCP-NEXT: vmovlps %xmm1, (%rsi)
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FCP-NEXT: vmovlps %xmm0, (%rdx)
; AVX2-FCP-NEXT: retq
;
@@ -65,8 +65,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rdi), %xmm0
; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512-NEXT: vmovlps %xmm0, (%rdx)
; AVX512-NEXT: retq
;
@@ -74,8 +74,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0
; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512-FCP-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512-FCP-NEXT: vmovlps %xmm0, (%rdx)
; AVX512-FCP-NEXT: retq
;
@@ -83,8 +83,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-NEXT: vmovlps %xmm0, (%rdx)
; AVX512DQ-NEXT: retq
;
@@ -92,8 +92,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: retq
;
@@ -101,8 +101,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512BW-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512BW-NEXT: vmovlps %xmm0, (%rdx)
; AVX512BW-NEXT: retq
;
@@ -110,8 +110,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: retq
;
@@ -119,8 +119,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: retq
;
@@ -128,8 +128,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <4 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index 34f2321..8af9594 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -21,13 +21,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: movq %xmm2, (%rsi)
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: movq %xmm2, (%rdx)
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movq %xmm2, (%rsi)
-; SSE-NEXT: movq %xmm3, (%rdx)
; SSE-NEXT: movq %xmm0, (%rcx)
; SSE-NEXT: retq
;
@@ -36,12 +36,12 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vmovaps 16(%rdi), %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%rsi)
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%rdx)
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX-NEXT: vmovlps %xmm2, (%rsi)
-; AVX-NEXT: vmovlps %xmm3, (%rdx)
; AVX-NEXT: vmovlps %xmm0, (%rcx)
; AVX-NEXT: retq
;
@@ -50,13 +50,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovaps (%rdi), %xmm0
; AVX2-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-NEXT: vmovlps %xmm0, (%rdx)
-; AVX2-NEXT: vmovlps %xmm1, (%rcx)
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX2-NEXT: vmovlps %xmm0, (%rcx)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i32_stride3_vf2:
@@ -64,13 +64,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-FP-NEXT: vmovlps %xmm0, (%rdx)
-; AVX2-FP-NEXT: vmovlps %xmm1, (%rcx)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX2-FP-NEXT: vmovlps %xmm0, (%rcx)
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i32_stride3_vf2:
@@ -78,13 +78,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-FCP-NEXT: vmovlps %xmm0, (%rdx)
-; AVX2-FCP-NEXT: vmovlps %xmm1, (%rcx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm0, (%rcx)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i32_stride3_vf2:
@@ -92,13 +92,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovaps (%rdi), %xmm0
; AVX512-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512-NEXT: vmovlps %xmm0, (%rcx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i32_stride3_vf2:
@@ -119,13 +119,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512DQ-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512DQ-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512DQ-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512DQ-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512DQ-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512DQ-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-NEXT: vmovlps %xmm0, (%rcx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride3_vf2:
@@ -146,13 +146,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512BW-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512BW-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512BW-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512BW-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512BW-NEXT: vmovlps %xmm0, (%rcx)
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i32_stride3_vf2:
@@ -173,13 +173,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512DQ-BW-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rcx)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf2:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 822d31e..f7ddcfc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -22,13 +22,13 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: movq %xmm2, (%rsi)
-; SSE-NEXT: movq %xmm3, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE-NEXT: movq %xmm2, (%rdx)
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: movq %xmm0, (%rcx)
-; SSE-NEXT: movq %xmm1, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: movq %xmm0, (%r8)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_stride4_vf2:
@@ -36,11 +36,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5,6,7]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovq %xmm2, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
+; AVX-NEXT: vmovq %xmm2, (%rdx)
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovq %xmm0, (%rcx)
; AVX-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX-NEXT: retq
;
@@ -50,11 +50,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovq %xmm0, (%rcx)
; AVX2-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX2-NEXT: retq
@@ -64,11 +64,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FP-NEXT: vmovq %xmm0, (%rcx)
; AVX2-FP-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX2-FP-NEXT: retq
@@ -78,11 +78,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX2-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX2-FCP-NEXT: retq
@@ -92,11 +92,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovq %xmm0, (%rcx)
; AVX512-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512-NEXT: retq
@@ -108,9 +108,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
; AVX512-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512-FCP-NEXT: vzeroupper
@@ -121,11 +121,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512DQ-NEXT: retq
@@ -137,9 +137,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512DQ-FCP-NEXT: vzeroupper
@@ -150,11 +150,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512BW-NEXT: retq
@@ -166,9 +166,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512BW-FCP-NEXT: vzeroupper
@@ -179,11 +179,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
;
AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX512DQ-BW-NEXT: retq @@ -195,9 +195,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1] ; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index 4f80140..fea8ebd 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -24,19 +24,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movq %xmm4, (%rsi) ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movq %xmm4, (%rdx) ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm5, (%rdx) ; SSE-NEXT: movq %xmm0, (%rcx) -; SSE-NEXT: movq %xmm6, (%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movq %xmm4, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movq %xmm1, (%r9) ; SSE-NEXT: retq ; @@ -46,16 +46,16 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] ; AVX-NEXT: vmovq %xmm3, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vpextrq $1, %xmm5, (%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX-NEXT: vpextrq $1, %xmm1, (%rcx) +; AVX-NEXT: vpalignr 
{{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX-NEXT: vmovq %xmm0, (%r8) -; AVX-NEXT: vmovq %xmm1, (%r9) +; AVX-NEXT: vmovq %xmm4, (%r9) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i32_stride5_vf2: @@ -64,17 +64,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX2-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX2-NEXT: vpbroadcastd 16(%rdi), %ymm4 ; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-NEXT: vpextrq $1, %xmm1, (%rcx) +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX2-NEXT: vmovq %xmm0, (%r8) -; AVX2-NEXT: vmovq %xmm2, (%r9) +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-NEXT: vmovq %xmm0, (%r9) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -84,17 +84,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX2-FP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX2-FP-NEXT: vpbroadcastd 16(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-FP-NEXT: vpextrq $1, %xmm1, (%rcx) +; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX2-FP-NEXT: vmovq %xmm0, (%r8) -; AVX2-FP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-FP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; @@ -104,17 +104,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX2-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX2-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovq 
%xmm4, (%rdx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-FCP-NEXT: vpextrq $1, %xmm1, (%rcx) +; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX2-FCP-NEXT: vmovq %xmm0, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -123,21 +123,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512-NEXT: vpextrd $2, %xmm1, %eax -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512-NEXT: vpextrd $3, %xmm1, %eax -; AVX512-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: vmovq %xmm4, (%rdx) +; AVX512-NEXT: vpextrd $3, %xmm1, %r10d +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512-NEXT: vpbroadcastd 16(%rdi), %ymm4 +; AVX512-NEXT: vmovq %xmm1, (%rsi) +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, (%rdx) +; AVX512-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1 ; AVX512-NEXT: vmovq %xmm1, (%rcx) +; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512-NEXT: vmovq %xmm0, (%r8) -; AVX512-NEXT: vmovq %xmm2, (%r9) +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512-NEXT: vmovq %xmm0, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -146,19 +146,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] -; AVX512-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx) -; AVX512-FCP-NEXT: vmovlps %xmm4, (%rcx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0] +; AVX512-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0] +; AVX512-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovlps %xmm2, (%rcx) +; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm1, (%r9) +; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -167,21 +167,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512DQ-NEXT: vpextrd $2, %xmm1, %eax -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512DQ-NEXT: vpextrd $3, %xmm1, %eax -; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512DQ-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm4, (%rdx) +; AVX512DQ-NEXT: vpextrd $3, %xmm1, %r10d +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-NEXT: vpbroadcastd 16(%rdi), %ymm4 +; AVX512DQ-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1 ; AVX512DQ-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512DQ-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512DQ-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -190,19 +190,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] -; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%rcx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -211,21 +211,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; 
AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax -; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512BW-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-NEXT: vmovq %xmm4, (%rdx) +; AVX512BW-NEXT: vpextrd $3, %xmm1, %r10d +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512BW-NEXT: vpbroadcastd 16(%rdi), %ymm4 +; AVX512BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1 ; AVX512BW-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512BW-NEXT: vmovq %xmm0, (%r8) -; AVX512BW-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512BW-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -234,19 +234,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] -; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%rcx) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rcx) +; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -255,21 +255,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm1, %eax -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; 
AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vpextrd $3, %xmm1, %eax -; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 -; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512DQ-BW-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx) +; AVX512DQ-BW-NEXT: vpextrd $3, %xmm1, %r10d +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-BW-NEXT: vpbroadcastd 16(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx) +; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -278,19 +278,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <10 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index 85ed618..49b1318 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -18,31 +18,31 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf2: ; SSE: # %bb.0: -; SSE-NEXT: movq 
{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSE-NEXT: movq %xmm1, (%rsi) -; SSE-NEXT: movq %xmm4, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm6, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq %xmm3, (%rdx) +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movq %xmm4, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movq %xmm5, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movq %xmm0, (%r9) -; SSE-NEXT: movq %xmm7, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movq %xmm3, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i32_stride6_vf2: @@ -53,22 +53,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,2,3,3] ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm0[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX-NEXT: vmovlps %xmm3, (%rsi) +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3] +; AVX-NEXT: vmovlps %xmm3, (%rdx) +; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0,2,3] +; AVX-NEXT: vmovlps %xmm3, (%rcx) ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,3,3] -; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3] -; AVX-NEXT: vmovlps %xmm3, (%rsi) -; AVX-NEXT: vmovlps %xmm4, (%rdx) -; AVX-NEXT: vmovlps %xmm5, (%rcx) ; AVX-NEXT: vmovlps %xmm0, (%r8) 
-; AVX-NEXT: vmovlps %xmm6, (%r9) -; AVX-NEXT: vmovlps %xmm1, (%rax) +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,2,3,3] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX-NEXT: vmovlps %xmm0, (%r9) +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-NEXT: vmovlps %xmm0, (%rax) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i32_stride6_vf2: @@ -80,22 +80,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vmovaps 32(%rdi), %xmm3 ; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-NEXT: vmovlps %xmm4, (%rsi) ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3] -; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX2-NEXT: vmovlps %xmm2, (%rdx) +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3] +; AVX2-NEXT: vmovlps %xmm2, (%rcx) ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX2-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm3 -; AVX2-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] -; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vmovlps %xmm4, (%rsi) -; AVX2-NEXT: vmovlps %xmm2, (%rdx) -; AVX2-NEXT: vmovlps %xmm5, (%rcx) ; AVX2-NEXT: vmovlps %xmm1, (%r8) -; AVX2-NEXT: vmovlps %xmm3, (%r9) +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vmovlps %xmm1, (%r9) +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0] +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovlps %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -109,22 +109,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm3 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FP-NEXT: vmovlps %xmm4, (%rsi) ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx) +; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3] +; AVX2-FP-NEXT: vmovlps %xmm2, (%rcx) ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0] -; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FP-NEXT: vpermps %ymm0, %ymm3, %ymm3 -; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] -; AVX2-FP-NEXT: vpermps %ymm0, %ymm6, %ymm0 -; AVX2-FP-NEXT: vmovlps %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx) -; AVX2-FP-NEXT: vmovlps %xmm5, (%rcx) ; AVX2-FP-NEXT: vmovlps %xmm1, (%r8) -; AVX2-FP-NEXT: vmovlps %xmm3, (%r9) +; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] +; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm1 +; AVX2-FP-NEXT: vmovlps %xmm1, (%r9) +; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0] 
+; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovlps %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -138,54 +138,56 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm3 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FCP-NEXT: vmovlps %xmm4, (%rsi) ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx) +; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3] +; AVX2-FCP-NEXT: vmovlps %xmm2, (%rcx) ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] -; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0 -; AVX2-FCP-NEXT: vmovlps %xmm4, (%rsi) -; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx) -; AVX2-FCP-NEXT: vmovlps %xmm5, (%rcx) ; AVX2-FCP-NEXT: vmovlps %xmm1, (%r8) -; AVX2-FCP-NEXT: vmovlps %xmm3, (%r9) +; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovlps %xmm1, (%r9) +; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0] +; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i32_stride6_vf2: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512-NEXT: vextractps $2, %xmm1, %r10d -; AVX512-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 -; AVX512-NEXT: vextractps $3, %xmm1, %r10d +; AVX512-NEXT: vextractps $3, %xmm1, %r11d +; AVX512-NEXT: vmovd %xmm2, %ebx +; AVX512-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1 +; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovq %xmm1, (%rsi) ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 -; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512-NEXT: vmovd %xmm2, %r10d -; AVX512-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 +; AVX512-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, (%rdx) +; AVX512-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1 +; AVX512-NEXT: vmovq %xmm1, (%rcx) ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] -; AVX512-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: vmovq %xmm1, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) ; AVX512-NEXT: vmovq %xmm0, (%r8) -; AVX512-NEXT: vmovlps %xmm2, (%r9) -; AVX512-NEXT: 
vmovlps %xmm5, (%rax) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512-NEXT: vmovlps %xmm0, (%r9) +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512-NEXT: vmovlps %xmm0, (%rax) +; AVX512-NEXT: popq %rbx ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -195,56 +197,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] ; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm1 ; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] -; AVX512-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] -; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] -; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] -; AVX512-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vmovlps %xmm0, (%rsi) -; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) -; AVX512-FCP-NEXT: vmovlps %xmm4, (%r9) -; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0] +; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%rdx) +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4] +; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0 +; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0] +; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0 +; AVX512-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i32_stride6_vf2: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: pushq %rbx ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-NEXT: vextractps $2, %xmm1, %r10d -; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 -; AVX512DQ-NEXT: vextractps $3, %xmm1, %r10d +; AVX512DQ-NEXT: vextractps $3, %xmm1, %r11d +; AVX512DQ-NEXT: vmovd %xmm2, %ebx +; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512DQ-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-NEXT: vmovq %xmm1, (%rsi) ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-NEXT: vmovd %xmm2, %r10d -; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-NEXT: 
vpinsrd $1, %ebx, %xmm3, %xmm1 +; AVX512DQ-NEXT: vmovq %xmm1, (%rcx) ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] -; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512DQ-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-NEXT: vmovlps %xmm2, (%r9) -; AVX512DQ-NEXT: vmovlps %xmm5, (%rax) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512DQ-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512DQ-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovlps %xmm0, (%rax) +; AVX512DQ-NEXT: popq %rbx ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -254,56 +258,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] -; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] -; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] -; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rsi) -; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) -; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r9) -; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rdx) +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0 +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0 +; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride6_vf2: ; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %rbx ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 ; 
AVX512BW-NEXT: vextractps $2, %xmm1, %r10d -; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 -; AVX512BW-NEXT: vextractps $3, %xmm1, %r10d +; AVX512BW-NEXT: vextractps $3, %xmm1, %r11d +; AVX512BW-NEXT: vmovd %xmm2, %ebx +; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1 +; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512BW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovq %xmm1, (%rsi) ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 -; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512BW-NEXT: vmovd %xmm2, %r10d -; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512BW-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1 +; AVX512BW-NEXT: vmovq %xmm1, (%rcx) ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] -; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512BW-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX512BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-NEXT: vmovq %xmm1, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512BW-NEXT: vmovq %xmm0, (%r8) -; AVX512BW-NEXT: vmovlps %xmm2, (%r9) -; AVX512BW-NEXT: vmovlps %xmm5, (%rax) +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512BW-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%r9) +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512BW-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovlps %xmm0, (%rax) +; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -313,56 +319,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] -; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] -; AVX512BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] -; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] -; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rsi) -; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) -; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r9) -; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rdx) +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4] +; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0 +; 
AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0] +; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i32_stride6_vf2: ; AVX512DQ-BW: # %bb.0: +; AVX512DQ-BW-NEXT: pushq %rbx ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vextractps $2, %xmm1, %r10d -; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 -; AVX512DQ-BW-NEXT: vextractps $3, %xmm1, %r10d +; AVX512DQ-BW-NEXT: vextractps $3, %xmm1, %r11d +; AVX512DQ-BW-NEXT: vmovd %xmm2, %ebx +; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1 +; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 -; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4 -; AVX512DQ-BW-NEXT: vmovd %xmm2, %r10d -; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 +; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) +; AVX512DQ-BW-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1 +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx) ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0] -; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0] -; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8) -; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%r9) -; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax) +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512DQ-BW-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512DQ-BW-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax) +; AVX512DQ-BW-NEXT: popq %rbx ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -372,25 +380,25 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0] ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm4, 
%xmm5, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0] -; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <12 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index 7948141..64ddca7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -18,35 +18,35 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i32_stride7_vf2: ; SSE: # %bb.0: -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: movq %xmm1, (%rsi) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd 
{{.*#+}} xmm7 = xmm1[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movdqa 48(%rdi), %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
-; SSE-NEXT: movq %xmm0, (%rsi)
; SSE-NEXT: movq %xmm4, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
; SSE-NEXT: movq %xmm5, (%rcx)
+; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; SSE-NEXT: movq %xmm6, (%r8)
-; SSE-NEXT: movq %xmm1, (%r9)
-; SSE-NEXT: movq %xmm3, (%r10)
-; SSE-NEXT: movq %xmm7, (%rax)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movq %xmm0, (%r9)
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: movq %xmm2, (%rsi)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: movq %xmm4, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_stride7_vf2:
@@ -60,26 +60,26 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovaps 32(%rdi), %xmm4
; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm2[1],xmm4[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0,2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
+; AVX-NEXT: vmovlps %xmm5, (%rsi)
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
+; AVX-NEXT: vmovlps %xmm5, (%rdx)
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX-NEXT: vmovlps %xmm5, (%rcx)
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r8)
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r9)
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r10)
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
-; AVX-NEXT: vmovlps %xmm5, (%rsi)
-; AVX-NEXT: vmovlps %xmm6, (%rdx)
-; AVX-NEXT: vmovlps %xmm7, (%rcx)
-; AVX-NEXT: vmovlps %xmm2, (%r8)
-; AVX-NEXT: vmovlps %xmm3, (%r9)
-; AVX-NEXT: vmovlps %xmm4, (%r10)
; AVX-NEXT: vmovlps %xmm0, (%rax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -94,27 +94,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovaps (%rdi), %xmm3
; AVX2-NEXT: vmovaps 32(%rdi), %xmm4
; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
-; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm6
-; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
-; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpermps %ymm7, %ymm4, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm5
+; AVX2-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rcx)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%r8)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%r10)
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovlps %xmm2, (%rsi)
-; AVX2-NEXT: vmovlps %xmm5, (%rdx)
-; AVX2-NEXT: vmovlps %xmm6, (%rcx)
-; AVX2-NEXT: vmovlps %xmm3, (%r8)
-; AVX2-NEXT: vmovlps %xmm4, (%r9)
-; AVX2-NEXT: vmovlps %xmm7, (%r10)
; AVX2-NEXT: vmovlps %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -129,27 +129,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vmovaps (%rdi), %xmm3
; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4
; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
-; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm6
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT: vpermps %ymm7, %ymm4, %ymm4
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm5
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3]
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r8)
+; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm2
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r10)
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovlps %xmm5, (%rdx)
-; AVX2-FP-NEXT: vmovlps %xmm6, (%rcx)
-; AVX2-FP-NEXT: vmovlps %xmm3, (%r8)
-; AVX2-FP-NEXT: vmovlps %xmm4, (%r9)
-; AVX2-FP-NEXT: vmovlps %xmm7, (%r10)
; AVX2-FP-NEXT: vmovlps %xmm0, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
@@ -164,27 +164,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
-; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm6
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm4
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm5
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r8)
+; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r10)
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovlps %xmm5, (%rdx)
-; AVX2-FCP-NEXT: vmovlps %xmm6, (%rcx)
-; AVX2-FCP-NEXT: vmovlps %xmm3, (%r8)
-; AVX2-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX2-FCP-NEXT: vmovlps %xmm7, (%r10)
; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
@@ -195,31 +195,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512-NEXT: vmovd %xmm1, %r11d
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
+; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
+; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0]
+; AVX512-NEXT: vpermps (%rdi), %zmm4, %zmm4
+; AVX512-NEXT: vmovaps 32(%rdi), %ymm5
+; AVX512-NEXT: vmovaps (%rdi), %ymm6
+; AVX512-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512-NEXT: vmovq %xmm2, (%rcx)
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512-NEXT: vpermps (%rdi), %zmm1, %zmm1
-; AVX512-NEXT: vmovaps (%rdi), %ymm5
-; AVX512-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX512-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
-; AVX512-NEXT: vmovq %xmm4, (%rcx)
; AVX512-NEXT: vmovq %xmm0, (%r8)
-; AVX512-NEXT: vmovlps %xmm1, (%r9)
-; AVX512-NEXT: vmovlps %xmm7, (%r10)
-; AVX512-NEXT: vmovlps %xmm5, (%rax)
+; AVX512-NEXT: vmovlps %xmm4, (%r9)
+; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX512-NEXT: vmovlps %xmm0, (%r10)
+; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7]
+; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovlps %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -231,24 +231,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
-; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
-; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
+; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
+; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0]
+; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3
+; AVX512-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2
-; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
-; AVX512-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512-FCP-NEXT: vmovlps %xmm2, (%r10)
+; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -259,31 +259,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512DQ-NEXT: vmovd %xmm1, %r11d
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
+; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
+; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0]
+; AVX512DQ-NEXT: vpermps (%rdi), %zmm4, %zmm4
+; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm5
+; AVX512DQ-NEXT: vmovaps (%rdi), %ymm6
+; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rcx)
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512DQ-NEXT: vpermps (%rdi), %zmm1, %zmm1
-; AVX512DQ-NEXT: vmovaps (%rdi), %ymm5
-; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX512DQ-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX512DQ-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512DQ-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
; AVX512DQ-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-NEXT: vmovlps %xmm7, (%r10)
-; AVX512DQ-NEXT: vmovlps %xmm5, (%rax)
+; AVX512DQ-NEXT: vmovlps %xmm4, (%r9)
+; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX512DQ-NEXT: vmovlps %xmm0, (%r10)
+; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -295,24 +295,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
+; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
-; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%r10)
+; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -323,31 +323,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512BW-NEXT: vmovd %xmm1, %r11d
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
+; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
+; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0]
+; AVX512BW-NEXT: vpermps (%rdi), %zmm4, %zmm4
+; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm5
+; AVX512BW-NEXT: vmovaps (%rdi), %ymm6
+; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512BW-NEXT: vmovq %xmm2, (%rcx)
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512BW-NEXT: vpermps (%rdi), %zmm1, %zmm1
-; AVX512BW-NEXT: vmovaps (%rdi), %ymm5
-; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX512BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512BW-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX512BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
; AVX512BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512BW-NEXT: vmovlps %xmm1, (%r9)
-; AVX512BW-NEXT: vmovlps %xmm7, (%r10)
-; AVX512BW-NEXT: vmovlps %xmm5, (%rax)
+; AVX512BW-NEXT: vmovlps %xmm4, (%r9)
+; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX512BW-NEXT: vmovlps %xmm0, (%r10)
+; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7]
+; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -359,24 +359,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
+; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
-; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%r10)
+; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -387,31 +387,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512DQ-BW-NEXT: vmovd %xmm1, %r11d
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
+; AVX512DQ-BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
+; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0]
+; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm4, %zmm4
+; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm5
+; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm6
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx)
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm1, %zmm1
-; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm5
-; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-BW-NEXT: vmovlps %xmm7, (%r10)
-; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax)
+; AVX512DQ-BW-NEXT: vmovlps %xmm4, (%r9)
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r10)
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7]
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -423,24 +423,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
index 13410fb..a118b40 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
@@ -27,22 +27,22 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa 48(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE-NEXT: movdqa %xmm1, %xmm6
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE-NEXT: movq %xmm4, (%rsi)
-; SSE-NEXT: movq %xmm5, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; SSE-NEXT: movq %xmm4, (%rdx)
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: movq %xmm0, (%rcx)
-; SSE-NEXT: movq %xmm2, (%r8)
-; SSE-NEXT: movq %xmm6, (%r9)
-; SSE-NEXT: movq %xmm7, (%r11)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: movq %xmm0, (%r8)
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: movq %xmm0, (%r9)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: movq %xmm0, (%r11)
+; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT: movq %xmm1, (%r10)
-; SSE-NEXT: movq %xmm3, (%rax)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: movq %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_stride8_vf2:
@@ -55,26 +55,26 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX-NEXT: vmovdqa (%rdi), %xmm3
; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%rsi)
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%rdx)
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3]
-; AVX-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX-NEXT: vmovq %xmm2, (%rcx)
+; AVX-NEXT: vpextrq $1, %xmm2, (%r8)
+; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vmovlps %xmm2, (%r9)
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r11)
+; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vmovlps %xmm2, (%r10)
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
-; AVX-NEXT: vmovq %xmm4, (%rsi)
-; AVX-NEXT: vmovq %xmm5, (%rdx)
-; AVX-NEXT: vmovq %xmm2, (%rcx)
-; AVX-NEXT: vpextrq $1, %xmm2, (%r8)
-; AVX-NEXT: vmovlps %xmm3, (%r9)
-; AVX-NEXT: vmovlps %xmm6, (%r11)
-; AVX-NEXT: vmovlps %xmm7, (%r10)
; AVX-NEXT: vmovlps %xmm0, (%rax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -84,30 +84,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
-; AVX2-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
-; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7]
-; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-NEXT: vmovq %xmm5, (%rdx)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-NEXT: vmovq %xmm2, (%rcx)
; AVX2-NEXT: vpextrq $1, %xmm2, (%r8)
-; AVX2-NEXT: vmovlps %xmm3, (%r9)
-; AVX2-NEXT: vmovlps %xmm6, (%r11)
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5]
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vmovlps %xmm2, (%r11)
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovlps %xmm1, (%r10)
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovlps %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -117,30 +117,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0
-; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3]
-; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7]
-; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
-; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FP-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm5, (%rdx)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-FP-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FP-NEXT: vmovq %xmm2, (%rcx)
; AVX2-FP-NEXT: vpextrq $1, %xmm2, (%r8)
-; AVX2-FP-NEXT: vmovlps %xmm3, (%r9)
-; AVX2-FP-NEXT: vmovlps %xmm6, (%r11)
+; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
+; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r11)
+; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vmovlps %xmm1, (%r10)
+; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FP-NEXT: vmovlps %xmm0, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
@@ -150,30 +150,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx)
; AVX2-FCP-NEXT: vpextrq $1, %xmm2, (%r8)
-; AVX2-FCP-NEXT: vmovlps %xmm3, (%r9)
-; AVX2-FCP-NEXT: vmovlps %xmm6, (%r11)
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r11)
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
@@ -186,28 +186,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX512-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512-NEXT: vmovaps (%rdi), %ymm4
-; AVX512-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
-; AVX512-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
-; AVX512-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX512-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovq %xmm0, (%rcx)
; AVX512-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512-NEXT: vmovlps %xmm5, (%r9)
-; AVX512-NEXT: vmovlps %xmm6, (%r11)
-; AVX512-NEXT: vmovlps %xmm4, (%r10)
-; AVX512-NEXT: vmovlps %xmm1, (%rax)
+; AVX512-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovlps %xmm0, (%r9)
+; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5]
+; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovlps %xmm0, (%r11)
+; AVX512-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovlps %xmm1, (%r10)
+; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovlps %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -219,27 +219,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
-; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6
-; AVX512-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
-; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3
+; AVX512-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3
+; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm5
; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0]
+; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512-FCP-NEXT: vmovlps %xmm5, (%r9)
-; AVX512-FCP-NEXT: vmovlps %xmm6, (%r11)
-; AVX512-FCP-NEXT: vmovlps %xmm4, (%r10)
-; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
+; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512-FCP-NEXT: vmovlps %xmm3, (%r11)
+; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
+; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -251,28 +251,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX512DQ-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512DQ-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512DQ-NEXT: vmovlps %xmm5, (%r9)
-; AVX512DQ-NEXT: vmovlps %xmm6, (%r11)
-; AVX512DQ-NEXT: vmovlps %xmm4, (%r10)
-; AVX512DQ-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%r11)
+; AVX512DQ-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT: vmovlps %xmm1, (%r10)
+; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -284,27 +284,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6
-; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
-; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3
+; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm5
; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm5, (%r9)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm6, (%r11)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r10)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
+; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%r11)
+; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
+; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -316,28 +316,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512BW-NEXT: vmovaps (%rdi), %ymm4
-; AVX512BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
-; AVX512BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
-; AVX512BW-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX512BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512BW-NEXT: vmovlps %xmm5, (%r9)
-; AVX512BW-NEXT: vmovlps %xmm6, (%r11)
-; AVX512BW-NEXT: vmovlps %xmm4, (%r10)
-; AVX512BW-NEXT: vmovlps %xmm1, (%rax)
+; AVX512BW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%r9)
+; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5]
+; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%r11)
+; AVX512BW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovlps %xmm1, (%r10)
+; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -349,27 +349,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
-; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6
-; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
-; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3
+; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm5
; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512BW-FCP-NEXT: vmovlps %xmm5, (%r9)
-; AVX512BW-FCP-NEXT: vmovlps %xmm6, (%r11)
-; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r10)
-; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
+; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%r11)
+; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
+; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -381,28 +381,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX512DQ-BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-BW-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%r9)
-; AVX512DQ-BW-NEXT: vmovlps %xmm6, (%r11)
-; AVX512DQ-BW-NEXT: vmovlps %xmm4, (%r10)
-; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-BW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5]
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r11)
+; AVX512DQ-BW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r10)
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -414,27 +414,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm5
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm5, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm6, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
+; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%r11)
+; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
+; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <16 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
index 81fe19c..b609299 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
@@ -280,9 +280,9 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: movq %xmm1, (%rsi)
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movq %xmm1, (%rsi)
; SSE-NEXT: movq %xmm0, (%rdx)
; SSE-NEXT: retq
;
@@ -290,8 +290,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm1, (%rsi)
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, (%rdx)
; AVX-NEXT: retq
;
@@ -299,8 +299,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovq %xmm1, (%rsi)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovq %xmm0, (%rdx)
; AVX2-NEXT: retq
;
@@ -308,8 +308,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vmovq %xmm1, (%rsi)
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FP-NEXT: retq
;
@@ -317,8 +317,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FCP-NEXT: retq
;
@@ -326,8 +326,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovq %xmm1, (%rsi)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovq %xmm0, (%rdx)
; AVX512-NEXT: retq
;
@@ -335,8 +335,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512-FCP-NEXT: retq
;
@@ -344,8 +344,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-NEXT: retq
;
@@ -353,41 +353,41 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i8_stride2_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i8_stride2_vf8:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rdx)
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i8_stride2_vf8:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i8_stride2_vf8:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <16 x i8>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
index d1d7cb0..a238371 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
@@ -378,39 +378,39 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
; SSE-NEXT: packuswb %xmm4, %xmm4
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: pand %xmm5, %xmm6
-; SSE-NEXT: pandn %xmm1, %xmm5
-; SSE-NEXT: por %xmm6, %xmm5
-; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: pand %xmm6, %xmm5
-; SSE-NEXT: pandn %xmm3, %xmm6
-; SSE-NEXT: por %xmm5, %xmm6
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
-; SSE-NEXT: packuswb %xmm5, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm6, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm6
-; SSE-NEXT: por %xmm0, %xmm6
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSE-NEXT: movq %xmm4, (%rsi)
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
+;
SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm5, (%rdx) ; SSE-NEXT: movq %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -421,14 +421,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vmovq %xmm2, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vmovq %xmm2, (%rdx) ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm2, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) ; AVX-NEXT: vmovq %xmm0, (%rcx) ; AVX-NEXT: retq ; @@ -439,14 +439,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm2, (%rsi) 
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rdx) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm2, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) ; AVX2-NEXT: vmovq %xmm0, (%rcx) ; AVX2-NEXT: retq ; @@ -457,14 +457,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FP-NEXT: retq ; @@ -475,14 +475,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX2-FCP-NEXT: retq ; @@ -493,14 +493,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rdx) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) ; AVX512-NEXT: vmovq %xmm0, (%rcx) ; AVX512-NEXT: retq ; @@ -511,14 +511,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512-FCP-NEXT: retq ; @@ -529,14 +529,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-NEXT: retq ; @@ -547,14 +547,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-FCP-NEXT: retq ; @@ -565,14 +565,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-NEXT: retq ; @@ -583,14 +583,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: 
vmovq %xmm3, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512BW-FCP-NEXT: retq ; @@ -601,14 +601,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-NEXT: retq ; @@ -619,14 +619,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <24 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index abef980..1dff9f4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -409,62 +409,62 @@ define void @load_i8_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0] +; 
SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,3,2,3] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movq %xmm3, (%rsi) +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw 
{{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; SSE-NEXT: movq %xmm4, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] -; SSE-NEXT: movq %xmm0, (%rsi) -; SSE-NEXT: movq %xmm6, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm1, (%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride4_vf8: @@ -475,22 +475,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vmovq %xmm3, (%rdx) -; AVX-NEXT: vmovq %xmm4, (%rcx) -; AVX-NEXT: 
vmovq %xmm1, (%r8) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX-NEXT: vmovq %xmm0, (%rdx) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX-NEXT: vmovq %xmm0, (%rcx) +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX-NEXT: vmovq %xmm0, (%r8) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride4_vf8: @@ -501,22 +501,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vmovq %xmm3, (%rdx) -; AVX2-NEXT: vmovq %xmm4, (%rcx) -; AVX2-NEXT: vmovq %xmm1, (%r8) +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vmovq %xmm0, (%rdx) +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vmovq %xmm0, (%rcx) +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-NEXT: vmovq %xmm0, (%r8) ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride4_vf8: @@ -527,22 +527,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = 
[3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-FP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%rcx) +; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride4_vf8: @@ -553,125 +553,125 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm1, (%r8) +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx) +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FCP-NEXT: vmovq %xmm0, (%r8) ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride4_vf8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512-NEXT: vpsrld $8, %ymm0, 
%ymm1 ; AVX512-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512-NEXT: vpmovdb %ymm3, (%r8) +; AVX512-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdb %ymm0, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i8_stride4_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride4_vf8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride4_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: load_i8_stride4_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512BW-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512BW-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512BW-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512BW-NEXT: vpmovdb %ymm3, (%r8) +; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512BW-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512BW-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %ymm0, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i8_stride4_vf8: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512BW-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512BW-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512BW-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512BW-FCP-NEXT: vpmovdb %ymm1, 
(%rcx) +; AVX512BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i8_stride4_vf8: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-BW-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-BW-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-BW-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-BW-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-BW-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-BW-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-BW-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-BW-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-BW-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-BW-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride4_vf8: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm1, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm2, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm3, (%r8) +; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm1, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm0, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <32 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index ac14f55..5db006e5d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -583,133 +583,133 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} 
xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movq %xmm8, (%rsi) +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pslld $24, %xmm7 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movq %xmm5, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,6,5,6,7] ; 
SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: movq %xmm9, (%rcx) +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm8 ; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm8[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,6] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm9 ; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,5,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pslld $24, %xmm8 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm10, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm11[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,7,4,6,7] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,6] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,0,3,4,5,6,7] +; SSE-NEXT: movq %xmm9, (%r8) +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; 
SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movq %xmm5, (%rsi) -; SSE-NEXT: movq %xmm2, (%rdx) -; SSE-NEXT: movq %xmm9, (%rcx) -; SSE-NEXT: movq %xmm10, (%r8) -; SSE-NEXT: movq %xmm7, (%r9) +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movq %xmm5, (%r9) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride5_vf8: @@ -722,30 +722,30 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%rcx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm3, (%r8) ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,5,15,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovq %xmm3, (%rsi) -; AVX-NEXT: vmovq %xmm4, (%rdx) -; AVX-NEXT: vmovq %xmm5, (%rcx) -; AVX-NEXT: vmovq %xmm6, (%r8) ; AVX-NEXT: vmovq %xmm0, (%r9) ; AVX-NEXT: retq ; @@ -758,26 +758,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-NEXT: vmovq %xmm3, (%rsi) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm3, (%rdx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm3, (%rcx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vmovq %xmm3, (%r8) ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm3, (%rsi) -; AVX2-NEXT: vmovq %xmm4, (%rdx) -; AVX2-NEXT: vmovq %xmm5, (%rcx) -; AVX2-NEXT: vmovq %xmm6, (%r8) ; AVX2-NEXT: vmovq %xmm0, (%r9) ; AVX2-NEXT: retq ; @@ -790,26 +790,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FP-NEXT: vmovq 
%xmm4, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FP-NEXT: retq ; @@ -822,26 +822,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX2-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX2-FCP-NEXT: retq ; @@ -854,26 +854,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512-NEXT: vmovq %xmm3, (%rsi) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm3, (%rdx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm3, (%rcx) +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-NEXT: vmovq %xmm3, (%r8) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm3, (%rsi) -; AVX512-NEXT: vmovq %xmm4, (%rdx) -; AVX512-NEXT: vmovq %xmm5, (%rcx) -; AVX512-NEXT: vmovq %xmm6, (%r8) ; AVX512-NEXT: vmovq %xmm0, (%r9) ; AVX512-NEXT: retq ; @@ -886,26 +886,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = 
xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512-FCP-NEXT: retq ; @@ -918,26 +918,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq 
%xmm3, (%rdx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-NEXT: retq ; @@ -950,26 +950,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw 
{{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-FCP-NEXT: retq ; @@ -982,26 +982,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vmovq %xmm3, (%r8) ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; 
AVX512BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-NEXT: vmovq %xmm5, (%rcx) -; AVX512BW-NEXT: vmovq %xmm6, (%r8) ; AVX512BW-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-NEXT: retq ; @@ -1014,26 +1014,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512BW-FCP-NEXT: retq ; @@ -1046,26 +1046,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-NEXT: retq ; @@ -1078,26 +1078,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb 
{{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8) ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9) ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <40 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index f87126a..763b8a6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -755,146 +755,146 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16711935,16711935,16711935,16711935] -; 
SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [16711935,16711935,16711935,16711935] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movq %xmm9, (%rsi) +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,0,0,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: pandn 
%xmm9, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: movq %xmm7, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm9, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: movq %xmm12, (%rcx) ; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm14, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: punpcklbw 
{{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm14 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: pand %xmm9, %xmm14 -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,1,0,2] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,1,2,0] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,1,0,2] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} 
xmm9 = xmm9[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm9[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,2] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: movq %xmm10, (%r8) +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,0] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movq %xmm3, (%r9) +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: movq %xmm2, (%rsi) -; SSE-NEXT: movq %xmm1, (%rdx) -; SSE-NEXT: movq %xmm10, (%rcx) -; SSE-NEXT: movq %xmm12, (%r8) -; SSE-NEXT: movq %xmm3, (%r9) -; SSE-NEXT: movq %xmm9, (%rax) +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movq %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride6_vf8: @@ -910,42 +910,42 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[4,10,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, 
%xmm6, %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vmovq {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX-NEXT: vmovq %xmm3, (%rsi) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%rdx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vmovq {{.*#+}} xmm4 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%rcx) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%r8) +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX-NEXT: vmovq %xmm3, (%r9) ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} 
xmm0 = zero,zero,zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovq %xmm3, (%rsi)
-; AVX-NEXT: vmovq %xmm4, (%rdx)
-; AVX-NEXT: vmovq %xmm5, (%rcx)
-; AVX-NEXT: vmovq %xmm7, (%r8)
-; AVX-NEXT: vmovq %xmm8, (%r9)
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
;
@@ -959,30 +959,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-NEXT: vmovq %xmm4, (%rsi)
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-NEXT: vmovq %xmm4, (%rcx)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%r8)
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%r9)
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-NEXT: vmovq %xmm2, (%rdx)
-; AVX2-NEXT: vmovq %xmm6, (%rcx)
-; AVX2-NEXT: vmovq %xmm3, (%r8)
-; AVX2-NEXT: vmovq %xmm5, (%r9)
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -997,30 +997,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-FP-NEXT: vmovq %xmm4, (%rsi)
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-FP-NEXT: vmovq %xmm4, (%rcx)
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%r8)
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%r9)
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-FP-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm6, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm3, (%r8)
-; AVX2-FP-NEXT: vmovq %xmm5, (%r9)
; AVX2-FP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
@@ -1035,30 +1035,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi)
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx)
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r8)
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r9)
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%r8)
-; AVX2-FCP-NEXT: vmovq %xmm5, (%r9)
; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
@@ -1073,30 +1073,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512-NEXT: vmovq %xmm4, (%rsi)
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512-NEXT: vmovq %xmm4, (%rcx)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, (%r8)
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vmovq %xmm2, (%r9)
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm4, (%rsi)
-; AVX512-NEXT: vmovq %xmm2, (%rdx)
-; AVX512-NEXT: vmovq %xmm6, (%rcx)
-; AVX512-NEXT: vmovq %xmm3, (%r8)
-; AVX512-NEXT: vmovq %xmm5, (%r9)
; AVX512-NEXT: vmovq %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1111,30 +1111,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512-FCP-NEXT: vmovq %xmm4, (%rsi)
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovq %xmm2, (%r8)
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512-FCP-NEXT: vmovq %xmm2, (%r9)
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovq %xmm4, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm6, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%r8)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%r9)
; AVX512-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -1149,30 +1149,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-NEXT: vmovq %xmm4, (%rsi)
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovq %xmm2, (%r8)
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-NEXT: vmovq %xmm2, (%r9)
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovq %xmm4, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm6, (%rcx)
-; AVX512DQ-NEXT: vmovq %xmm3, (%r8)
-; AVX512DQ-NEXT: vmovq %xmm5, (%r9)
; AVX512DQ-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -1187,30 +1187,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rsi)
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r8)
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9)
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9)
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -1225,30 +1225,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512BW-NEXT: vmovq %xmm4, (%rsi)
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, (%r8)
; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, (%r9)
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm4, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm6, (%rcx)
-; AVX512BW-NEXT: vmovq %xmm3, (%r8)
-; AVX512BW-NEXT: vmovq %xmm5, (%r9)
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1263,30 +1263,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rsi)
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx)
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r8)
; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9)
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9)
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -1301,30 +1301,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rsi)
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r8)
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9)
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm6, (%rcx)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
@@ -1339,30 +1339,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r8)
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9)
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 8248126..09d0079 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -932,106 +932,100 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i8_stride7_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm3
+; SSE-NEXT: movdqa (%rdi), %xmm2
; SSE-NEXT: movdqa 16(%rdi), %xmm11
; SSE-NEXT: movdqa 32(%rdi), %xmm6
-; SSE-NEXT: movdqa 48(%rdi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535]
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm11, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
+; SSE-NEXT: movdqa 48(%rdi), %xmm13
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535]
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pandn %xmm11, %xmm4
+; SSE-NEXT: por %xmm1, %xmm4
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: pand %xmm7, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE-NEXT: pand %xmm7, %xmm4
; SSE-NEXT: pandn %xmm5, %xmm7
-; SSE-NEXT: por %xmm2, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,3,2,1,4,5,6,7]
+; SSE-NEXT: por %xmm4, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,3,2,1,4,5,6,7]
; SSE-NEXT: packuswb %xmm7, %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm2, %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm4, %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm6, %xmm5
; SSE-NEXT: pand %xmm9, %xmm5
-; SSE-NEXT: pandn %xmm0, %xmm9
+; SSE-NEXT: pandn %xmm13, %xmm9
; SSE-NEXT: por %xmm5, %xmm9
-; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
; SSE-NEXT: movdqa %xmm6, %xmm8
-; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3]
+; SSE-NEXT: movss {{.*#+}} xmm8 = xmm13[0],xmm8[1,2,3]
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535]
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm11, %xmm10
-; SSE-NEXT: movdqa %xmm11, %xmm1
; SSE-NEXT: pand %xmm5, %xmm10
; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm6, %xmm4
-; SSE-NEXT: pand %xmm12, %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pandn %xmm0, %xmm12
-; SSE-NEXT: movaps %xmm0, %xmm14
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: pand %xmm12, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pandn %xmm13, %xmm12
+; SSE-NEXT: movaps %xmm13, %xmm14
; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm6[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm6[2,3]
-; SSE-NEXT: pand %xmm5, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm5, %xmm13
; SSE-NEXT: pandn %xmm6, %xmm5
-; SSE-NEXT: movdqa %xmm6, %xmm15
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,5,6]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,6]
; SSE-NEXT: packuswb %xmm9, %xmm9
-; SSE-NEXT: movdqa %xmm2, %xmm11
-; SSE-NEXT: movdqa %xmm2, %xmm13
-; SSE-NEXT: pandn %xmm9, %xmm13
-; SSE-NEXT: por %xmm7, %xmm13
+; SSE-NEXT: movdqa %xmm4, %xmm15
+; SSE-NEXT: pandn %xmm9, %xmm15
+; SSE-NEXT: por %xmm7, %xmm15
+; SSE-NEXT: movq %xmm15, (%rsi)
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535]
; SSE-NEXT: movdqa %xmm7, %xmm9
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pandn %xmm1, %xmm9
-; SSE-NEXT: movdqa %xmm3, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: por %xmm9, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm9
-; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; SSE-NEXT: pandn %xmm11, %xmm9
+; SSE-NEXT: movdqa %xmm2, %xmm15
+; SSE-NEXT: pand %xmm7, %xmm15
+; SSE-NEXT: por %xmm9, %xmm15
+; SSE-NEXT: movdqa %xmm15, %xmm9
+; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pandn %xmm9, %xmm1
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm3[8],xmm15[9],xmm3[9],xmm15[10],xmm3[10],xmm15[11],xmm3[11],xmm15[12],xmm3[12],xmm15[13],xmm3[13],xmm15[14],xmm3[14],xmm15[15],xmm3[15]
+; SSE-NEXT: pand %xmm0, %xmm15
+; SSE-NEXT: por %xmm1, %xmm15
+; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm8, %xmm9
; SSE-NEXT: pand %xmm1, %xmm9
-; SSE-NEXT: pandn %xmm15, %xmm1
+; SSE-NEXT: pandn %xmm6, %xmm1
; SSE-NEXT: por %xmm9, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm11, %xmm9
+; SSE-NEXT: movdqa %xmm4, %xmm9
; SSE-NEXT: pandn %xmm1, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: pand %xmm11, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: por %xmm1, %xmm9
+; SSE-NEXT: movq %xmm9, (%rdx)
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: pandn %xmm2, %xmm1
; SSE-NEXT: por %xmm1, %xmm10
; SSE-NEXT: movdqa %xmm10, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
; SSE-NEXT: pand %xmm0, %xmm10
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: por %xmm10, %xmm0
@@ -1040,107 +1034,104 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pand %xmm11, %xmm0
+; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535]
; SSE-NEXT: pand %xmm1, %xmm8
-; SSE-NEXT: pandn %xmm15, %xmm1
+; SSE-NEXT: pandn %xmm6, %xmm1
; SSE-NEXT: por %xmm8, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm11, %xmm8
+; SSE-NEXT: movdqa %xmm4, %xmm8
; SSE-NEXT: pandn %xmm1, %xmm8
; SSE-NEXT: por %xmm0, %xmm8
+; SSE-NEXT: movq %xmm8, (%rcx)
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm11, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pandn %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm10
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm8
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; SSE-NEXT: por %xmm0, %xmm3
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7]
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: por %xmm0, %xmm8
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,2,1,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pand %xmm11, %xmm0
+; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3],xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,7]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6]
-; SSE-NEXT: packuswb %xmm3, %xmm3
-; SSE-NEXT: pandn %xmm3, %xmm11
-; SSE-NEXT: por %xmm0, %xmm11
-; SSE-NEXT: movdqa %xmm11, %xmm6
+; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm12[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
+; SSE-NEXT: packuswb %xmm8, %xmm8
+; SSE-NEXT: pandn %xmm8, %xmm4
+; SSE-NEXT: por %xmm0, %xmm4
+; SSE-NEXT: movq %xmm4, (%r8)
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: movdqa %xmm10, %xmm11
-; SSE-NEXT: pandn %xmm10, %xmm0
-; SSE-NEXT: por %xmm3, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm11, %xmm4
+; SSE-NEXT: pand %xmm0, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm0
+; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5]
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535]
-; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pandn %xmm15, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: packuswb %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,3,2,3]
-; SSE-NEXT: movdqa %xmm11, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535]
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pandn %xmm6, %xmm4
+; SSE-NEXT: por %xmm1, %xmm4
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE-NEXT: movq %xmm0, (%r9)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3],xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7]
; SSE-NEXT: pand %xmm0, %xmm14
-; SSE-NEXT: pandn %xmm15, %xmm0
+; SSE-NEXT: pandn %xmm6, %xmm0
; SSE-NEXT: por %xmm14, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,3,2,3]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pand %xmm7, %xmm0
-; SSE-NEXT: pandn %xmm3, %xmm7
-; SSE-NEXT: por %xmm0, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
+; SSE-NEXT: movq %xmm0, (%rcx)
+; SSE-NEXT: pand %xmm7, %xmm11
+; SSE-NEXT: pandn %xmm2, %xmm7
+; SSE-NEXT: por %xmm11, %xmm7
; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE-NEXT: por %xmm13, %xmm5
+; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
; SSE-NEXT: pand %xmm1, %xmm5
-; SSE-NEXT: pandn %xmm15, %xmm1
+; SSE-NEXT: pandn %xmm6, %xmm1
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
@@ -1148,12 +1139,6 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE-NEXT: movq %xmm13, (%rsi)
-; SSE-NEXT: movq %xmm9, (%rdx)
-; SSE-NEXT: movq %xmm8, (%rcx)
-; SSE-NEXT: movq %xmm6, (%r8)
-; SSE-NEXT: movq %xmm10, (%r9)
-; SSE-NEXT: movq %xmm11, (%rdi)
; SSE-NEXT: movq %xmm0, (%rax)
; SSE-NEXT: retq
;
@@ -1174,52 +1159,52 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0]
; AVX-NEXT: # xmm7 = mem[0,0]
; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5
-; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3]
-; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8
-; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm8, %xmm5
-; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8
-; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm9
-; AVX-NEXT: vpblendvb %xmm7, %xmm8, %xmm9, %xmm8
-; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9
-; AVX-NEXT: vpalignr {{.*#+}} xmm10 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
-; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6
-; AVX-NEXT: vpblendvb %xmm7, %xmm9, %xmm6, %xmm6
-; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7
-; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm10
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3],xmm10[4,5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%rsi)
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3]
+; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vmovq %xmm4, (%rdx)
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vmovq %xmm4, (%rcx)
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
+; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vmovq %xmm4, (%r8)
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vmovd {{.*#+}} xmm5 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6
+; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%r9)
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4
+; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%r10)
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vmovq %xmm4, (%rsi)
-; AVX-NEXT: vmovq %xmm5, (%rdx)
-; AVX-NEXT: vmovq %xmm8, (%rcx)
-; AVX-NEXT: vmovq %xmm6, (%r8)
-; AVX-NEXT: vmovq %xmm7, (%r9)
-; AVX-NEXT: vmovq %xmm10, (%r10)
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
;
@@ -1235,45 +1220,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
-; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
-; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%r8)
+; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%r9)
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%r10)
+; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-NEXT: vmovq %xmm5, (%r8)
-; AVX2-NEXT: vmovq %xmm6, (%r9)
-; AVX2-NEXT: vmovq %xmm7, (%r10)
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1290,45 +1275,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
-; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
-; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
-; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
-; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
-; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
+; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
+; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%r8)
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
+; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%r10)
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 =
[0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm5, (%r8) -; AVX2-FP-NEXT: vmovq %xmm6, (%r9) -; AVX2-FP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq @@ -1345,45 +1330,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] -; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; 
AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] +; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -1400,44 +1385,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = 
zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512-NEXT: vmovq %xmm2, (%rsi) +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rdx) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%rcx) +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r8) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512-NEXT: 
vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r9) +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, (%r10) ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vmovq %xmm2, (%rsi) -; AVX512-NEXT: vmovq %xmm3, (%rdx) -; AVX512-NEXT: vmovq %xmm4, (%rcx) -; AVX512-NEXT: vmovq %xmm5, (%r8) -; AVX512-NEXT: vmovq %xmm6, (%r9) -; AVX512-NEXT: vmovq %xmm7, (%r10) ; AVX512-NEXT: vmovq %xmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1454,44 +1439,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovq %xmm2, (%r10) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -1508,44 +1493,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = 
[65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovq %xmm2, (%r10) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1562,44 +1547,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = 
[65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r10) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -1617,48 +1602,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: movw $580, %di # imm = 0x244 -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-NEXT: movw $580, %dx # imm = 0x244 +; AVX512BW-NEXT: kmovd %edx, %k1 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r8) +; AVX512BW-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovq %xmm2, (%r10) +; AVX512BW-NEXT: movw $9288, %cx # imm = 0x2448 +; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovq %xmm2, (%rsi) -; 
AVX512BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vmovq %xmm7, (%r10) ; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1676,48 +1661,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512BW-FCP-NEXT: movw $580, %di # imm = 0x244 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512BW-FCP-NEXT: movw $580, %dx # imm = 0x244 +; 
AVX512BW-FCP-NEXT: kmovd %edx, %k1 +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512BW-FCP-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX512BW-FCP-NEXT: movw $9288, %cx # imm = 0x2448 +; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -1735,48 +1720,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-BW-NEXT: movw $580, 
%di # imm = 0x244 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512DQ-BW-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-NEXT: movw $580, %dx # imm = 0x244 +; AVX512DQ-BW-NEXT: kmovd %edx, %k1 +; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-BW-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r10) +; AVX512DQ-BW-NEXT: movw $9288, %cx # imm = 0x2448 +; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -1794,48 +1779,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-BW-FCP-NEXT: movw $580, %di # imm = 0x244 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) +; AVX512DQ-BW-FCP-NEXT: movw $580, %dx # imm = 0x244 +; AVX512DQ-BW-FCP-NEXT: kmovd %edx, %k1 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX512DQ-BW-FCP-NEXT: movw $4644, %cx # imm = 0x1224 +; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX512DQ-BW-FCP-NEXT: movw $9288, %cx # imm = 0x2448 +; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: 
vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 6770fb6..deb74d2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -878,212 +878,205 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i8_stride8_vf8: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm9 ; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa 48(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm13 +; SSE-NEXT: movdqa 48(%rdi), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0] ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] +; SSE-NEXT: movq %xmm0, (%rsi) +; SSE-NEXT: pxor %xmm5, %xmm5 ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = 
xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: packuswb %xmm14, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,1,3] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = 
xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[3,3,3,3] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,1,1,3] +; SSE-NEXT: packuswb %xmm12, %xmm12 +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movq %xmm0, (%rdx) +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm12, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm12, %xmm12 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm3, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movq %xmm4, (%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: 
por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movq %xmm0, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,0,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movq %xmm6, (%r9) +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq %xmm6, (%rax) +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; 
SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,5,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq %xmm1, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,3,2,3] -; SSE-NEXT: movq %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlps %xmm0, (%rdx) -; SSE-NEXT: movq %xmm8, (%rcx) -; SSE-NEXT: movq %xmm15, (%r8) -; SSE-NEXT: movq %xmm11, (%r9) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %xmm9, (%rax) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %xmm4, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq %xmm3, (%rax) -; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride8_vf8: @@ -1104,76 +1097,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm5 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX-NEXT: vmovq %xmm4, (%rsi) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX-NEXT: vmovd {{.*#+}} xmm6 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm6 = 
[0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX-NEXT: vmovd {{.*#+}} xmm7 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm7 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX-NEXT: vmovd {{.*#+}} xmm8 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm8 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm9 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX-NEXT: vmovd {{.*#+}} xmm10 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm10 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX-NEXT: vmovd {{.*#+}} xmm11 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] -; AVX-NEXT: vmovd {{.*#+}} xmm11 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rdx) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb 
%xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%rcx) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r8) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r9) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r11) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vmovq %xmm4, (%r10) +; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovq %xmm4, (%rsi) -; AVX-NEXT: vmovq %xmm5, (%rdx) -; AVX-NEXT: vmovq %xmm6, (%rcx) -; AVX-NEXT: vmovq %xmm7, (%r8) -; AVX-NEXT: vmovq %xmm8, (%r9) -; AVX-NEXT: vmovq %xmm9, (%r11) -; AVX-NEXT: vmovq %xmm10, (%r10) ; 
AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq ; @@ -1195,76 +1188,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rsi) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; 
AVX2-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rdx) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%rcx) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r8) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r9) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, 
%xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r11) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-NEXT: vmovq %xmm4, (%r10) +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-NEXT: vmovq %xmm4, (%rsi) -; AVX2-NEXT: vmovq %xmm5, (%rdx) -; AVX2-NEXT: vmovq %xmm6, (%rcx) -; AVX2-NEXT: vmovq %xmm7, (%r8) -; AVX2-NEXT: vmovq %xmm8, (%r9) -; AVX2-NEXT: vmovq %xmm9, (%r11) -; AVX2-NEXT: vmovq %xmm10, (%r10) ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; @@ -1286,76 +1279,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-FP-NEXT: vpshufb 
%xmm7, %xmm0, %xmm7 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, 
%xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r8) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r9) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r11) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 +; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FP-NEXT: vmovq %xmm4, (%r10) +; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovq %xmm5, (%rdx) -; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FP-NEXT: vmovq %xmm7, (%r8) -; AVX2-FP-NEXT: vmovq %xmm8, (%r9) -; AVX2-FP-NEXT: vmovq %xmm9, (%r11) -; AVX2-FP-NEXT: vmovq %xmm10, (%r10) ; AVX2-FP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FP-NEXT: retq ; @@ -1364,54 +1357,54 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3] +; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi) ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm8 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm9 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm14 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,5,7,5,7,6,7] -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm8, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm11, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) -; 
AVX2-FCP-NEXT: vmovq %xmm4, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm5, (%r11) -; AVX2-FCP-NEXT: vmovq %xmm6, (%r10) -; AVX2-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm7 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm8 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3] +; AVX2-FCP-NEXT: vmovq %xmm7, (%rdx) +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm9 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm10 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3] +; AVX2-FCP-NEXT: vmovq %xmm9, (%rcx) +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm11 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r8) +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r11) +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-FCP-NEXT: vmovq %xmm2, (%r10) +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; @@ -1421,21 +1414,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpsrlq $8, %zmm0, %zmm1 -; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm2 -; AVX512-NEXT: vpsrlq $24, %zmm0, %zmm3 -; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm4 -; AVX512-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512-NEXT: vpsrlq $56, %zmm0, %zmm7 ; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512-NEXT: vpmovqb %zmm3, (%r8) -; AVX512-NEXT: vpmovqb %zmm4, (%r9) -; AVX512-NEXT: vpmovqb %zmm5, (%r11) -; AVX512-NEXT: vpmovqb %zmm6, (%r10) -; AVX512-NEXT: vpmovqb %zmm7, (%rax) +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%rcx) +; AVX512-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r8) +; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r9) +; AVX512-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r11) +; AVX512-NEXT: vpsrlq $48, %zmm0, %zmm1 +; AVX512-NEXT: vpmovqb %zmm1, (%r10) +; AVX512-NEXT: vpsrlq $56, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1445,21 +1438,21 
@@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpmovqb %zmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -1469,21 +1462,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512DQ-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512DQ-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512DQ-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512DQ-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovqb %zmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -1493,21 +1486,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512DQ-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -1517,21 +1510,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512BW-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512BW-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512BW-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512BW-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512BW-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512BW-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512BW-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512BW-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512BW-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512BW-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512BW-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512BW-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqb %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -1541,21 +1534,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512BW-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpmovqb %zmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -1565,21 +1558,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512DQ-BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vpmovqb %zmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -1589,21 +1582,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <64 x i8>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index d0bb90c..552b927 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -250,7 +250,7 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,u,2,u,4,u,8,u,1,u,2,u,4,u,8,u]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
@@ -265,7 +265,7 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
; X86-SSE4-NEXT: movdqa %xmm0, %xmm1
; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
; X86-SSE4-NEXT: psllw $8, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
+; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: por %xmm1, %xmm0
; X86-SSE4-NEXT: retl
@@ -275,7 +275,7 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
; X64-SSE4-NEXT: movdqa %xmm0, %xmm1
; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
; X64-SSE4-NEXT: psllw $8, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
+; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: por %xmm1, %xmm0
; X64-SSE4-NEXT: retq
@@ -1058,11 +1058,11 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [3,9,17,33,65,129,2,3]
+; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [3,u,9,u,17,u,33,u,65,u,129,u,2,u,3,u]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,3,9,17,33,65,129,2]
+; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,u,3,u,9,u,17,u,33,u,65,u,129,u,2,u]
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: packuswb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
@@ -1072,7 +1072,7 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
; X86-SSE4-NEXT: movdqa %xmm0, %xmm1
; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
; X86-SSE4-NEXT: psllw $8, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
+; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: por %xmm1, %xmm0
; X86-SSE4-NEXT: retl
@@ -1081,11 +1081,11 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3,9,17,33,65,129,2,3]
+; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3,u,9,u,17,u,33,u,65,u,129,u,2,u,3,u]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,3,9,17,33,65,129,2]
+; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,u,3,u,9,u,17,u,33,u,65,u,129,u,2,u]
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: packuswb %xmm1, %xmm0
; X64-SSE2-NEXT: retq
@@ -1095,7 +1095,7 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
; X64-SSE4-NEXT: movdqa %xmm0, %xmm1
; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
; X64-SSE4-NEXT: psllw $8, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
+; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: por %xmm1, %xmm0
; X64-SSE4-NEXT: retq
@@ -1103,7 +1103,7 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
; X64-XOP-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
-; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
+; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14]
; X64-XOP-NEXT: retq
;
@@ -1832,7 +1832,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,u,1,u,3,u,7,u,15,u,31,u,63,u,127,u]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
@@ -1847,7 +1847,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
; X86-SSE4-NEXT: movdqa %xmm0, %xmm1
; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
; X86-SSE4-NEXT: psllw $8, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
+; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: por %xmm1, %xmm0
; X86-SSE4-NEXT: retl
@@ -1857,7 +1857,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
; X64-SSE4-NEXT: movdqa %xmm0, %xmm1
; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
; X64-SSE4-NEXT: psllw $8, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
+; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: por %xmm1, %xmm0
; X64-SSE4-NEXT: retq
@@ -1865,7 +1865,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
; X64-XOP-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
-; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
+; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14]
; X64-XOP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
index b233855..324fe12 100644
--- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
+++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
@@ -85,14 +85,14 @@ define <4 x i16> @smulfixsat(<4 x i16> %a) {
; CHECK-NEXT: movswl %dx, %edx
; CHECK-NEXT: leal (,%rdx,4), %esi
; CHECK-NEXT: movl %esi, %edi
-; CHECK-NEXT: shrl $16, %edi
-; CHECK-NEXT: shldw $1, %si, %di
+; CHECK-NEXT: shrl $16, %esi
+; CHECK-NEXT: shldw $1, %di, %si
; CHECK-NEXT: sarl $14, %edx
; CHECK-NEXT: cmpl $16384, %edx # imm = 0x4000
-; CHECK-NEXT: cmovgel %eax, %edi
+; CHECK-NEXT: cmovgel %eax, %esi
; CHECK-NEXT: cmpl $-16384, %edx # imm = 0xC000
-; CHECK-NEXT: cmovll %ecx, %edi
-; CHECK-NEXT: pinsrw $3, %edi, %xmm1
+; CHECK-NEXT: cmovll %ecx, %esi
+; CHECK-NEXT: pinsrw $3, %esi, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%t = call <4 x i16> @llvm.smul.fix.sat.v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> %a, i32 15)
@@ -106,19 +106,19 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) {
; CHECK-NEXT: pextrw $2, %xmm0, %eax
; CHECK-NEXT: leal (%rax,%rax,2), %eax
; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: shrl $16, %edx
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shldw $1, %ax, %cx
-; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000
+; CHECK-NEXT: shrl $16, %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shldw $1, %dx, %cx
+; CHECK-NEXT: cmpl $32768, %eax # imm = 0x8000
; CHECK-NEXT: movl $65535, %eax # imm = 0xFFFF
; CHECK-NEXT: cmovael %eax, %ecx
; CHECK-NEXT: pextrw $1, %xmm0, %edx
; CHECK-NEXT: addl %edx, %edx
; CHECK-NEXT: movl %edx, %esi
-; CHECK-NEXT: shrl $16, %esi
-; CHECK-NEXT: movl %esi, %edi
-; CHECK-NEXT: shldw $1, %dx, %di
-; CHECK-NEXT: cmpl $32768, %esi # imm = 0x8000
+; CHECK-NEXT: shrl $16, %edx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: shldw $1, %si, %di
+; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000
; CHECK-NEXT: cmovael %eax, %edi
; CHECK-NEXT: movd %xmm0, %edx
; CHECK-NEXT: xorl %esi, %esi
@@ -133,10 +133,10 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) {
; CHECK-NEXT: pextrw $3, %xmm0, %ecx
; CHECK-NEXT: shll $2, %ecx
; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: shrl $16, %edx
-; CHECK-NEXT: movl %edx, %esi
-; CHECK-NEXT: shldw $1, %cx, %si
-; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000
+; CHECK-NEXT: shrl $16, %ecx
+; CHECK-NEXT: movl %ecx, %esi
+; CHECK-NEXT: shldw $1, %dx, %si
+; CHECK-NEXT: cmpl $32768, %ecx # imm = 0x8000
; CHECK-NEXT: cmovael %eax, %esi
; CHECK-NEXT: pinsrw $3, %esi, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index 9816fa7..044327d 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -875,28 +875,12 @@ define i1 @mask_v8i32(<8 x i32> %a0) {
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
-; AVX1-LABEL: mask_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
-; AVX1-NEXT: sete %al
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: mask_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX2-NEXT: vptest %ymm1, %ymm0
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: mask_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX512-NEXT: vptest %ymm1, %ymm0
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: mask_v8i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vtestps %ymm0, %ymm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
%2 = and i32 %1, 2147483648
%3 = icmp eq i32 %2, 0
@@ -965,28 +949,12 @@ define i1 @signtest_v8i32(<8 x i32> %a0) {
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
-; AVX1-LABEL: signtest_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
-; AVX1-NEXT: sete %al
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: signtest_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX2-NEXT: vptest %ymm1, %ymm0
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: signtest_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX512-NEXT: vptest %ymm1, %ymm0
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: signtest_v8i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vtestps %ymm0, %ymm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
%2 = icmp sgt i32 %1, -1
ret i1 %2
@@ -1010,28 +978,12 @@ define i1 @signtest_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
-; AVX1-LABEL: signtest_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
-; AVX1-NEXT: sete %al
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: signtest_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vptest %ymm1, %ymm0
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: signtest_v4i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512-NEXT: vptest %ymm1, %ymm0
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: signtest_v4i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vtestpd %ymm0, %ymm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0)
%2 = icmp sgt i64 %1, -1
ret i1 %2
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 320dce8..6cb4323 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -397,8 +397,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
; AVX512F-NEXT: setnp %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -409,8 +409,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -421,8 +421,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
; AVX512VL-NEXT: vpmovw2m %ymm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -722,8 +722,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
; AVX512F-NEXT: setnp %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -734,8 +734,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -746,8 +746,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -974,13 +974,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind {
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: shrq $32, %rcx
-; AVX512BW-NEXT: xorl %eax, %ecx
-; AVX512BW-NEXT: movl %ecx, %eax
-; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrq $32, %rax
; AVX512BW-NEXT: xorl %ecx, %eax
-; AVX512BW-NEXT: xorb %ah, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $16, %ecx
+; AVX512BW-NEXT: xorl %eax, %ecx
+; AVX512BW-NEXT: xorb %ch, %cl
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -990,13 +990,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind {
; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovb2m %zmm0, %k0
; AVX512VL-NEXT: kmovq %k0, %rax
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: xorl %eax, %ecx
-; AVX512VL-NEXT: movl %ecx, %eax
-; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrq $32, %rax
; AVX512VL-NEXT: xorl %ecx, %eax
-; AVX512VL-NEXT: xorb %ah, %al
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrl $16, %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: xorb %ch, %cl
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -1211,8 +1211,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind {
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1222,8 +1222,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind {
; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: retq
%a = icmp eq <16 x i8> %0, zeroinitializer
@@ -1427,8 +1427,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
; AVX512F-NEXT: setnp %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1439,8 +1439,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
; AVX512BW-NEXT: vptestnmw %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1450,8 +1450,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
; AVX512VL-NEXT: vptestnmw %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -1756,8 +1756,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
; AVX512F-NEXT: setnp %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1767,8 +1767,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1778,8 +1778,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -2010,13 +2010,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: shrq $32, %rcx
-; AVX512BW-NEXT: xorl %eax, %ecx
-; AVX512BW-NEXT: movl %ecx, %eax
-; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrq $32, %rax
; AVX512BW-NEXT: xorl %ecx, %eax
-; AVX512BW-NEXT: xorb %ah, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $16, %ecx
+; AVX512BW-NEXT: xorl %eax, %ecx
+; AVX512BW-NEXT: xorb %ch, %cl
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -2025,13 +2025,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512VL-NEXT: kmovq %k0, %rax
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: xorl %eax, %ecx
-; AVX512VL-NEXT: movl %ecx, %eax
-; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrq $32, %rax
; AVX512VL-NEXT: xorl %ecx, %eax
-; AVX512VL-NEXT: xorb %ah, %al
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrl $16, %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: xorb %ch, %cl
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -2240,8 +2240,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind {
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -2251,8 +2251,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind {
; AVX512VL-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: retq
%a = icmp eq <16 x i8> %0, %1
@@ -2504,8 +2504,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
; AVX512F-NEXT: setnp %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -2517,8 +2517,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -2528,8 +2528,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
; AVX512VL-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -2845,8 +2845,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
; AVX512F-NEXT: setnp %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -2856,8 +2856,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -2867,8 +2867,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
; AVX512VL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -3097,13 +3097,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: shrq $32, %rcx
-; AVX512BW-NEXT: xorl %eax, %ecx
-; AVX512BW-NEXT: movl %ecx, %eax
-; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrq $32, %rax
; AVX512BW-NEXT: xorl %ecx, %eax
-; AVX512BW-NEXT: xorb %ah, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $16, %ecx
+; AVX512BW-NEXT: xorl %eax, %ecx
+; AVX512BW-NEXT: xorb %ch, %cl
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -3112,13 +3112,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512VL-NEXT: kmovq %k0, %rax
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: xorl %eax, %ecx
-; AVX512VL-NEXT: movl %ecx, %eax
-; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrq $32, %rax
; AVX512VL-NEXT: xorl %ecx, %eax
-; AVX512VL-NEXT: xorb %ah, %al
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrl $16, %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: xorb %ch, %cl
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index a768baa..466fa6b 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -5890,17 +5890,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512DQ-SLOW: # %bb.0:
; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0
; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
-; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-SLOW-NEXT: movw $255, %ax
-; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
-; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
-; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k2
-; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
-; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1
+; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1]
+; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT: kxnorb %k0, %k0, %k2
+; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512DQ-SLOW-NEXT: vzeroupper
@@ -5910,17 +5909,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512DQ-FAST: # %bb.0:
; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0
; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
-; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
-; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-FAST-NEXT: movw $255, %ax
-; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
-; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k2
-; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
-; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1
+; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
+; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: kxnorb %k0, %k0, %k2
+; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512DQ-FAST-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 0fb0420..aff2228 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -106,36 +106,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index c5d3297..7c1a531 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1931,31 +1931,28 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE-LABEL: constant_shift_v8i8:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: psraw $8, %xmm0
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1977,7 +1974,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -2003,14 +2001,12 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
;
; X86-SSE-LABEL: constant_shift_v8i8:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2]
; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
ret <8 x i8> %shift
@@ -2019,31 +2015,28 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE-LABEL: constant_shift_v4i8:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u]
; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,u,u,u,u]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -2065,7 +2058,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v4i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -2091,14 +2084,12 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
;
; X86-SSE-LABEL: constant_shift_v4i8:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,u,u,u,u]
; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
ret <4 x i8> %shift
@@ -2107,31 +2098,28 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE-LABEL: constant_shift_v2i8:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u]
; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,u,u,u,u,u,u]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -2153,7 +2141,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v2i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -2179,14 +2167,12 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
;
; X86-SSE-LABEL: constant_shift_v2i8:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,u,u,u,u,u,u]
; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = ashr <2 x i8> %a, <i8 2, i8 3>
ret <2 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index 103d570..4450d07 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -85,20 +85,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index eb39b6a..e6eb4d7 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1617,39 +1617,34 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1671,7 +1666,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1698,12 +1694,10 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v8i8:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2]
; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
ret <8 x i8> %shift
@@ -1713,39 +1707,34 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u]
; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,256,256,256,256]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u]
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,u,u,u,u]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1767,7 +1756,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v4i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1794,12 +1783,10 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v4i8:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,u,u,u,u]
; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
ret <4 x i8> %shift
@@ -1809,39 +1796,34 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u]
; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,256,256,256,256,256,256]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u]
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,u,u,u,u,u,u]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1863,7 +1845,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v2i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1890,12 +1872,10 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v2i8:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,u,u,u,u,u,u]
; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <2 x i8> %a, <i8 2, i8 3>
ret <2 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 3085c32..efe80b4 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -1151,11 +1151,11 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,64,32,16,8,4,2,1]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,u,64,u,32,u,16,u,8,u,4,u,2,u,1,u]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2,u,4,u,8,u,16,u,32,u,64,u,128,u]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -1165,7 +1165,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -1174,7 +1174,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -1232,11 +1232,11 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [128,64,32,16,8,4,2,1]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [128,u,64,u,32,u,16,u,8,u,4,u,2,u,1,u]
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X86-SSE-NEXT: pand %xmm2, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,16,32,64,128]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,u,2,u,4,u,8,u,16,u,32,u,64,u,128,u]
; X86-SSE-NEXT: pand %xmm2, %xmm0
; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index f9ccd1e..c7d2532 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1313,9 +1313,9 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
-; AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:
vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1325,7 +1325,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1352,7 +1352,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; AVX512DQ-NEXT: vpsllw $8, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: retq @@ -1366,7 +1366,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; ; AVX512DQVL-LABEL: constant_shift_v32i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; AVX512DQVL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; AVX512DQVL-NEXT: vpsllw $8, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst) @@ -1388,9 +1388,9 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; X86-AVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1 ; X86-AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; X86-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2] -; X86-AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] +; X86-AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1400,7 +1400,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; X86-AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 -; X86-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # 
[1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll index efd7429..1e5f1b8 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll @@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} ; AVX512BW-NEXT: retq %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -312,10 +307,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3 -; AVX512DQ-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] @@ -329,7 +324,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1] ; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & 
m32bcst) diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll index d245bdc..07e6c36 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -1429,7 +1429,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; SSE2-LABEL: constant_shift_v8i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128] +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2,u,4,u,8,u,16,u,32,u,64,u,128,u] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: packuswb %xmm1, %xmm0 @@ -1438,7 +1438,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; SSE41-LABEL: constant_shift_v8i8: ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128] +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2,u,4,u,8,u,16,u,32,u,64,u,128,u] ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: packuswb %xmm1, %xmm0 @@ -1447,7 +1447,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; AVX1-LABEL: constant_shift_v8i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,u,2,u,4,u,8,u,16,u,32,u,64,u,128,u] ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1478,7 +1478,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1505,7 +1506,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; X86-SSE-LABEL: constant_shift_v8i8: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,16,32,64,128] +; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,u,2,u,4,u,8,u,16,u,32,u,64,u,128,u] ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: packuswb %xmm1, %xmm0 @@ -1518,7 +1519,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE2-LABEL: constant_shift_v4i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,u,u,u,u] +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 
# [1,u,2,u,4,u,8,u,u,u,u,u,u,u,u,u] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: packuswb %xmm1, %xmm0 @@ -1527,7 +1528,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE41-LABEL: constant_shift_v4i8: ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,u,u,u,u] +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2,u,4,u,8,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: packuswb %xmm1, %xmm0 @@ -1536,7 +1537,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; AVX1-LABEL: constant_shift_v4i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,u,u,u,u] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,u,2,u,4,u,8,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1567,7 +1568,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1594,7 +1595,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; X86-SSE-LABEL: constant_shift_v4i8: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,u,u,u,u] +; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,u,2,u,4,u,8,u,u,u,u,u,u,u,u,u] ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: packuswb %xmm1, %xmm0 @@ -1607,7 +1608,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,8,u,u,u,u,u,u] +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,u,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: packuswb %xmm1, %xmm0 @@ -1616,7 +1617,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; SSE41-LABEL: constant_shift_v2i8: ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,8,u,u,u,u,u,u] +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,u,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pxor 
%xmm1, %xmm1 ; SSE41-NEXT: packuswb %xmm1, %xmm0 @@ -1625,7 +1626,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; AVX1-LABEL: constant_shift_v2i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,8,u,u,u,u,u,u] +; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,u,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1656,7 +1657,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_shift_v2i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 @@ -1683,7 +1684,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; X86-SSE-LABEL: constant_shift_v2i8: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4,8,u,u,u,u,u,u] +; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4,u,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: packuswb %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll index c33776d..b79d9e8c 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll @@ -22,6 +22,9 @@ declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32>, <16 x i32>, <16 x i1>) +declare <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32>, <16 x i32>, <16 x i1>) + define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double> %x1) { ; CHECK-LABEL: combine_permvar_8f64_identity: ; CHECK: # %bb.0: @@ -1031,3 +1034,24 @@ define <8 x double> @concat_vpermilvar_v8f64_v4f64(<4 x double> %a0, <4 x double %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ret <8 x double> %res } + +; shift elements up by one +define <16 x i32> @combine_vexpandd_as_valignd(<16 x i32> %x) { +; CHECK-LABEL: combine_vexpandd_as_valignd: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: valignd {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; CHECK-NEXT: ret{{[l|q]}} + %res = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %x, <16 x i32> zeroinitializer, <16 x i1> <i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, 
i1 true, i1 true>) + ret <16 x i32> %res +} + +; zero upper half of vector +define <16 x i32> @combine_vcompressd_as_vmov(<16 x i32> %x) { +; CHECK-LABEL: combine_vcompressd_as_vmov: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %x, <16 x i32> zeroinitializer, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>) + ret <16 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll index 3590c4d..ac58306 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll @@ -100,16 +100,14 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-NEXT: pshufb %xmm3, %xmm4 ; SSE-NEXT: pshufb %xmm8, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pmaddubsw %xmm3, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pmaddubsw %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pmullw %xmm1, %xmm2 +; SSE-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pmaddubsw %xmm3, %xmm0 ; SSE-NEXT: psllw $8, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: PR50049: @@ -129,21 +127,20 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm2 -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR50049: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index ee9d8a5..35e1c5a 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3575,21 +3575,17 @@ define void @SpinningCube() { ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[0,0],xmm0[0,1] -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE2-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSE2-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u] -; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] -; SSE2-NEXT: movq {{.*#+}} xmm3 = xmm3[0],zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0] -; SSE2-NEXT: addps %xmm0, %xmm3 -; SSE2-NEXT: movaps %xmm3, (%rax) -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: addps %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm0, (%rax) +; SSE2-NEXT: movd {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] +; SSE2-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE2-NEXT: movaps %xmm2, (%rax) +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: addps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, (%rax) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: SpinningCube: @@ -3598,54 +3594,43 @@ define void @SpinningCube() { ; SSSE3-NEXT: xorps %xmm0, %xmm0 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1] -; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSSE3-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSSE3-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u] -; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] -; SSSE3-NEXT: movq {{.*#+}} xmm3 = xmm3[0],zero -; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0] -; SSSE3-NEXT: addps %xmm0, %xmm3 -; SSSE3-NEXT: movaps %xmm3, (%rax) -; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2] -; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSSE3-NEXT: addps %xmm2, %xmm0 -; SSSE3-NEXT: movaps %xmm0, (%rax) +; SSSE3-NEXT: movd {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSSE3-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] +; SSSE3-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-NEXT: movaps %xmm2, (%rax) +; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2] +; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: addps %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, (%rax) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: SpinningCube: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,mem[0] -; SSE41-NEXT: movaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u] -; SSE41-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0] -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3] -; SSE41-NEXT: addps %xmm3, %xmm4 -; SSE41-NEXT: movaps %xmm4, (%rax) -; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[0,0,0,2] -; SSE41-NEXT: mulps %xmm1, %xmm2 -; SSE41-NEXT: addps %xmm0, %xmm2 -; SSE41-NEXT: movaps %xmm2, (%rax) +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] +; SSE41-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 +; SSE41-NEXT: movaps %xmm1, (%rax) +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2] +; SSE41-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: addps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, (%rax) ; SSE41-NEXT: retq ; ; AVX-LABEL: SpinningCube: ; AVX: # %bb.0: # %entry ; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,0.0E+0,1.0E+0] -; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u] -; AVX-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vmovaps %xmm2, (%rax) -; AVX-NEXT: vbroadcastss (%rax), %xmm2 -; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3] +; AVX-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vmovaps %xmm1, (%rax) +; AVX-NEXT: vbroadcastss (%rax), %xmm1 +; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovaps %xmm0, (%rax) ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/issue163738.ll b/llvm/test/CodeGen/X86/vpternlog.ll index 61fe043..bd7478d 100644 --- a/llvm/test/CodeGen/X86/issue163738.ll +++ b/llvm/test/CodeGen/X86/vpternlog.ll @@ -11,3 +11,15 @@ define <8 x i64> @foo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) { %and3 = xor <8 x i64> %and3.demorgan, splat (i64 -1) ret <8 x i64> %and3 } + +define <8 x i64> @xorbitcast(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c) { +; CHECK-LABEL: xorbitcast: +; CHECK: # %bb.0: +; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = ~(zmm0 | zmm2 | zmm1) +; CHECK-NEXT: retq + %or1 = or <64 x i8> %a, %b + %or2 = or <64 x i8> %or1, %c + %cast = bitcast <64 x i8> %or2 to <8 x i64> + %xor = xor <8 x i64> %cast, splat (i64 -1) + ret <8 x i64> %xor +} diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index 3c98eba6..65b6028 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -1,36 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK0 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK1 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK2 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK3 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck 
%s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK4 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK5 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK6 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK7 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK8 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK9 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK10 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK11 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK12 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK13 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK14 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK15 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK16 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK17 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK18 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK19 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK20 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK21 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK22 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK23 -; RUN: llc < %s 
-mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK24 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK25 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK26 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK27 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK28 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK29 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK30 -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK31 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-AVX,X64-NO-SHLD-NO-BMI2-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-HAVE-SHLD-NO-BMI2-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s 
--check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-NO-SHLD-HAVE-BMI2-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-SHLD-HAVE-BMI2-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-AVX,X64-NO-SHLD-NO-BMI2-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-HAVE-SHLD-NO-BMI2-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-NO-SHLD-HAVE-BMI2-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-SHLD-HAVE-BMI2-AVX512 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-SSE2 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-SSE2 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-SSE2 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-SSE2 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-SSE4 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-SSE4 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-SSE4 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-SSE4 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-AVX,X86-NO-SHLD-NO-BMI2-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-HAVE-SHLD-NO-BMI2-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-NO-SHLD-HAVE-BMI2-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu 
-mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-SHLD-HAVE-BMI2-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-AVX,X86-NO-SHLD-NO-BMI2-AVX512 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-HAVE-SHLD-NO-BMI2-AVX512 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-NO-SHLD-HAVE-BMI2-AVX512 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-SHLD-HAVE-BMI2-AVX512 define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: lshr_4bytes: @@ -646,784 +646,596 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq ; -; FALLBACK16-LABEL: lshr_16bytes: -; FALLBACK16: # %bb.0: -; FALLBACK16-NEXT: pushl %ebp -; FALLBACK16-NEXT: pushl %ebx -; FALLBACK16-NEXT: pushl %edi -; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $60, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: movl (%ecx), %edx -; FALLBACK16-NEXT: movl 4(%ecx), %esi -; FALLBACK16-NEXT: movl 8(%ecx), %edi -; FALLBACK16-NEXT: movl 12(%ecx), %ecx -; FALLBACK16-NEXT: movb (%eax), %ah -; FALLBACK16-NEXT: movb %ah, %al -; FALLBACK16-NEXT: shlb $3, %al -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: andb $12, %ah -; FALLBACK16-NEXT: movzbl %ah, %ebp -; FALLBACK16-NEXT: movl 20(%esp,%ebp), %esi -; FALLBACK16-NEXT: movl %esi, %ebx -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl %eax, %edx -; FALLBACK16-NEXT: notb %dl -; FALLBACK16-NEXT: movl 24(%esp,%ebp), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%ecx,%ecx), %edi -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %ebx, %edi -; FALLBACK16-NEXT: movl 16(%esp,%ebp), %ebx -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: addl %esi, %esi -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %ebx, %esi -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; FALLBACK16-NEXT: movl 28(%esp,%ebp), %ebx -; FALLBACK16-NEXT: leal (%ebx,%ebx), %ebp -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebx -; 
FALLBACK16-NEXT: movl %ebx, 12(%edx) -; FALLBACK16-NEXT: movl %ebp, 8(%edx) -; FALLBACK16-NEXT: movl %esi, (%edx) -; FALLBACK16-NEXT: movl %edi, 4(%edx) -; FALLBACK16-NEXT: addl $60, %esp -; FALLBACK16-NEXT: popl %esi -; FALLBACK16-NEXT: popl %edi -; FALLBACK16-NEXT: popl %ebx -; FALLBACK16-NEXT: popl %ebp -; FALLBACK16-NEXT: retl -; -; FALLBACK17-LABEL: lshr_16bytes: -; FALLBACK17: # %bb.0: -; FALLBACK17-NEXT: pushl %ebp -; FALLBACK17-NEXT: pushl %ebx -; FALLBACK17-NEXT: pushl %edi -; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $44, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK17-NEXT: movl (%edx), %esi -; FALLBACK17-NEXT: movl 4(%edx), %edi -; FALLBACK17-NEXT: movl 8(%edx), %ebx -; FALLBACK17-NEXT: movl 12(%edx), %edx -; FALLBACK17-NEXT: movb (%ecx), %ch -; FALLBACK17-NEXT: movb %ch, %cl -; FALLBACK17-NEXT: shlb $3, %cl -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, (%esp) -; FALLBACK17-NEXT: andb $12, %ch -; FALLBACK17-NEXT: movzbl %ch, %ebx -; FALLBACK17-NEXT: movl 8(%esp,%ebx), %esi -; FALLBACK17-NEXT: movl (%esp,%ebx), %edx -; FALLBACK17-NEXT: movl 4(%esp,%ebx), %ebp -; FALLBACK17-NEXT: movl %ebp, %edi -; FALLBACK17-NEXT: shrdl %cl, %esi, %edi -; FALLBACK17-NEXT: movl 12(%esp,%ebx), %ebx -; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi -; FALLBACK17-NEXT: shrdl %cl, %ebp, %edx -; FALLBACK17-NEXT: shrl %cl, %ebx -; FALLBACK17-NEXT: movl %esi, 8(%eax) -; FALLBACK17-NEXT: movl %ebx, 12(%eax) -; FALLBACK17-NEXT: movl %edx, (%eax) -; FALLBACK17-NEXT: movl %edi, 4(%eax) -; FALLBACK17-NEXT: addl $44, %esp -; FALLBACK17-NEXT: popl %esi -; FALLBACK17-NEXT: popl %edi -; FALLBACK17-NEXT: popl %ebx -; FALLBACK17-NEXT: popl %ebp -; FALLBACK17-NEXT: retl -; -; FALLBACK18-LABEL: lshr_16bytes: -; FALLBACK18: # %bb.0: -; FALLBACK18-NEXT: pushl %ebp -; FALLBACK18-NEXT: pushl %ebx -; FALLBACK18-NEXT: pushl %edi -; FALLBACK18-NEXT: pushl %esi -; FALLBACK18-NEXT: subl $44, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK18-NEXT: movl (%ecx), %edx -; FALLBACK18-NEXT: movl 4(%ecx), %esi -; FALLBACK18-NEXT: movl 8(%ecx), %edi -; FALLBACK18-NEXT: movl 12(%ecx), %ecx -; FALLBACK18-NEXT: movzbl (%eax), %ebx -; FALLBACK18-NEXT: movl %ebx, %eax -; FALLBACK18-NEXT: shlb $3, %al -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, (%esp) -; FALLBACK18-NEXT: andb $12, %bl -; FALLBACK18-NEXT: movzbl %bl, %esi -; FALLBACK18-NEXT: movl 4(%esp,%esi), %edi -; FALLBACK18-NEXT: movl 8(%esp,%esi), %ebx -; FALLBACK18-NEXT: shrxl %eax, %edi, %ebp -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ebx,%ebx), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: orl %ebp, %ecx -; FALLBACK18-NEXT: shrxl %eax, (%esp,%esi), %ebp -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %ebp, %edi -; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx -; FALLBACK18-NEXT: movl 12(%esp,%esi), %esi -; FALLBACK18-NEXT: shrxl %eax, %esi, %eax -; 
FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %edx -; FALLBACK18-NEXT: orl %ebx, %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK18-NEXT: movl %eax, 12(%esi) -; FALLBACK18-NEXT: movl %edx, 8(%esi) -; FALLBACK18-NEXT: movl %edi, (%esi) -; FALLBACK18-NEXT: movl %ecx, 4(%esi) -; FALLBACK18-NEXT: addl $44, %esp -; FALLBACK18-NEXT: popl %esi -; FALLBACK18-NEXT: popl %edi -; FALLBACK18-NEXT: popl %ebx -; FALLBACK18-NEXT: popl %ebp -; FALLBACK18-NEXT: retl -; -; FALLBACK19-LABEL: lshr_16bytes: -; FALLBACK19: # %bb.0: -; FALLBACK19-NEXT: pushl %ebp -; FALLBACK19-NEXT: pushl %ebx -; FALLBACK19-NEXT: pushl %edi -; FALLBACK19-NEXT: pushl %esi -; FALLBACK19-NEXT: subl $44, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK19-NEXT: movl (%edx), %esi -; FALLBACK19-NEXT: movl 4(%edx), %edi -; FALLBACK19-NEXT: movl 8(%edx), %ebx -; FALLBACK19-NEXT: movl 12(%edx), %edx -; FALLBACK19-NEXT: movzbl (%ecx), %eax -; FALLBACK19-NEXT: movl %eax, %ecx -; FALLBACK19-NEXT: shlb $3, %cl -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, (%esp) -; FALLBACK19-NEXT: andb $12, %al -; FALLBACK19-NEXT: movzbl %al, %eax -; FALLBACK19-NEXT: movl 8(%esp,%eax), %ebx -; FALLBACK19-NEXT: movl (%esp,%eax), %edx -; FALLBACK19-NEXT: movl 4(%esp,%eax), %esi -; FALLBACK19-NEXT: movl %esi, %edi -; FALLBACK19-NEXT: shrdl %cl, %ebx, %edi -; FALLBACK19-NEXT: movl 12(%esp,%eax), %eax -; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK19-NEXT: movl %ebx, 8(%ebp) -; FALLBACK19-NEXT: shrxl %ecx, %eax, %eax -; FALLBACK19-NEXT: movl %eax, 12(%ebp) -; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: shrdl %cl, %esi, %edx -; FALLBACK19-NEXT: movl %edx, (%ebp) -; FALLBACK19-NEXT: movl %edi, 4(%ebp) -; FALLBACK19-NEXT: addl $44, %esp -; FALLBACK19-NEXT: popl %esi -; FALLBACK19-NEXT: popl %edi -; FALLBACK19-NEXT: popl %ebx -; FALLBACK19-NEXT: popl %ebp -; FALLBACK19-NEXT: retl -; -; FALLBACK20-LABEL: lshr_16bytes: -; FALLBACK20: # %bb.0: -; FALLBACK20-NEXT: pushl %ebp -; FALLBACK20-NEXT: pushl %ebx -; FALLBACK20-NEXT: pushl %edi -; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $60, %esp -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: movups (%ecx), %xmm0 -; FALLBACK20-NEXT: movzbl (%eax), %ecx -; FALLBACK20-NEXT: movl %ecx, %eax -; FALLBACK20-NEXT: shlb $3, %al -; FALLBACK20-NEXT: xorps %xmm1, %xmm1 -; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: andb $12, %cl -; FALLBACK20-NEXT: movzbl %cl, %edi -; FALLBACK20-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl 20(%esp,%edi), %esi -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl %eax, %edx -; FALLBACK20-NEXT: notb %dl -; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 24(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, %esi -; FALLBACK20-NEXT: movl %eax, %ecx -; 
FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: movl 28(%esp,%edi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %esi, %ebp -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %esi, %ebx -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: movl %edi, 12(%edx) -; FALLBACK20-NEXT: movl %ebx, 4(%edx) -; FALLBACK20-NEXT: movl %ebp, 8(%edx) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl %eax, (%edx) -; FALLBACK20-NEXT: addl $60, %esp -; FALLBACK20-NEXT: popl %esi -; FALLBACK20-NEXT: popl %edi -; FALLBACK20-NEXT: popl %ebx -; FALLBACK20-NEXT: popl %ebp -; FALLBACK20-NEXT: retl -; -; FALLBACK21-LABEL: lshr_16bytes: -; FALLBACK21: # %bb.0: -; FALLBACK21-NEXT: pushl %ebp -; FALLBACK21-NEXT: pushl %ebx -; FALLBACK21-NEXT: pushl %edi -; FALLBACK21-NEXT: pushl %esi -; FALLBACK21-NEXT: subl $44, %esp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK21-NEXT: movups (%edx), %xmm0 -; FALLBACK21-NEXT: movzbl (%ecx), %edx -; FALLBACK21-NEXT: movl %edx, %ecx -; FALLBACK21-NEXT: shlb $3, %cl -; FALLBACK21-NEXT: xorps %xmm1, %xmm1 -; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm0, (%esp) -; FALLBACK21-NEXT: andb $12, %dl -; FALLBACK21-NEXT: movzbl %dl, %ebx -; FALLBACK21-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK21-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK21-NEXT: movl %ebp, %edi -; FALLBACK21-NEXT: shrdl %cl, %edx, %edi -; FALLBACK21-NEXT: movl (%esp,%ebx), %esi -; FALLBACK21-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK21-NEXT: movl %eax, %ebx -; FALLBACK21-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %ebx, 4(%ebp) -; FALLBACK21-NEXT: movl %edi, 8(%ebp) -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: shrl %cl, %edx -; FALLBACK21-NEXT: movl %edx, 12(%ebp) -; FALLBACK21-NEXT: movl %esi, (%ebp) -; FALLBACK21-NEXT: addl $44, %esp -; FALLBACK21-NEXT: popl %esi -; FALLBACK21-NEXT: popl %edi -; FALLBACK21-NEXT: popl %ebx -; FALLBACK21-NEXT: popl %ebp -; FALLBACK21-NEXT: retl -; -; FALLBACK22-LABEL: lshr_16bytes: -; FALLBACK22: # %bb.0: -; FALLBACK22-NEXT: pushl %ebp -; FALLBACK22-NEXT: pushl %ebx -; FALLBACK22-NEXT: pushl %edi -; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $44, %esp -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shlb $3, %al -; FALLBACK22-NEXT: xorps %xmm1, %xmm1 -; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, (%esp) -; FALLBACK22-NEXT: andb $12, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %eax, (%esp,%edi), %ebx -; FALLBACK22-NEXT: movl %eax, %ecx -; FALLBACK22-NEXT: notb %cl -; FALLBACK22-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK22-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK22-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK22-NEXT: orl %ebx, %edx -; FALLBACK22-NEXT: shrxl %eax, %esi, %ebx -; 
FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK22-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK22-NEXT: shrxl %eax, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK22-NEXT: orl %ebx, %edi -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK22-NEXT: orl %ebp, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl %eax, 12(%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) -; FALLBACK22-NEXT: movl %edi, 8(%esi) -; FALLBACK22-NEXT: movl %edx, (%esi) -; FALLBACK22-NEXT: addl $44, %esp -; FALLBACK22-NEXT: popl %esi -; FALLBACK22-NEXT: popl %edi -; FALLBACK22-NEXT: popl %ebx -; FALLBACK22-NEXT: popl %ebp -; FALLBACK22-NEXT: retl -; -; FALLBACK23-LABEL: lshr_16bytes: -; FALLBACK23: # %bb.0: -; FALLBACK23-NEXT: pushl %ebp -; FALLBACK23-NEXT: pushl %ebx -; FALLBACK23-NEXT: pushl %edi -; FALLBACK23-NEXT: pushl %esi -; FALLBACK23-NEXT: subl $44, %esp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK23-NEXT: movups (%edx), %xmm0 -; FALLBACK23-NEXT: movzbl (%ecx), %edx -; FALLBACK23-NEXT: movl %edx, %ecx -; FALLBACK23-NEXT: shlb $3, %cl -; FALLBACK23-NEXT: xorps %xmm1, %xmm1 -; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm0, (%esp) -; FALLBACK23-NEXT: andb $12, %dl -; FALLBACK23-NEXT: movzbl %dl, %ebx -; FALLBACK23-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK23-NEXT: movl %ebp, %edi -; FALLBACK23-NEXT: shrdl %cl, %edx, %edi -; FALLBACK23-NEXT: movl (%esp,%ebx), %esi -; FALLBACK23-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, %ebx -; FALLBACK23-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK23-NEXT: movl %ebx, 4(%ebp) -; FALLBACK23-NEXT: movl %edi, 8(%ebp) -; FALLBACK23-NEXT: shrxl %ecx, %edx, %edx -; FALLBACK23-NEXT: movl %edx, 12(%ebp) -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: shrdl %cl, %eax, %esi -; FALLBACK23-NEXT: movl %esi, (%ebp) -; FALLBACK23-NEXT: addl $44, %esp -; FALLBACK23-NEXT: popl %esi -; FALLBACK23-NEXT: popl %edi -; FALLBACK23-NEXT: popl %ebx -; FALLBACK23-NEXT: popl %ebp -; FALLBACK23-NEXT: retl -; -; FALLBACK24-LABEL: lshr_16bytes: -; FALLBACK24: # %bb.0: -; FALLBACK24-NEXT: pushl %ebp -; FALLBACK24-NEXT: pushl %ebx -; FALLBACK24-NEXT: pushl %edi -; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $60, %esp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK24-NEXT: movzbl (%eax), %ecx -; FALLBACK24-NEXT: movl %ecx, %eax -; FALLBACK24-NEXT: shlb $3, %al -; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: andb $12, %cl -; FALLBACK24-NEXT: movzbl %cl, %edi -; FALLBACK24-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl 20(%esp,%edi), %esi -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 24(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, %esi -; 
FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: movl 28(%esp,%edi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %esi, %ebp -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %esi, %ebx -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: movl %edi, 12(%edx) -; FALLBACK24-NEXT: movl %ebx, 4(%edx) -; FALLBACK24-NEXT: movl %ebp, 8(%edx) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl %eax, (%edx) -; FALLBACK24-NEXT: addl $60, %esp -; FALLBACK24-NEXT: popl %esi -; FALLBACK24-NEXT: popl %edi -; FALLBACK24-NEXT: popl %ebx -; FALLBACK24-NEXT: popl %ebp -; FALLBACK24-NEXT: retl -; -; FALLBACK25-LABEL: lshr_16bytes: -; FALLBACK25: # %bb.0: -; FALLBACK25-NEXT: pushl %ebp -; FALLBACK25-NEXT: pushl %ebx -; FALLBACK25-NEXT: pushl %edi -; FALLBACK25-NEXT: pushl %esi -; FALLBACK25-NEXT: subl $44, %esp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK25-NEXT: vmovups (%edx), %xmm0 -; FALLBACK25-NEXT: movzbl (%ecx), %edx -; FALLBACK25-NEXT: movl %edx, %ecx -; FALLBACK25-NEXT: shlb $3, %cl -; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK25-NEXT: andb $12, %dl -; FALLBACK25-NEXT: movzbl %dl, %ebx -; FALLBACK25-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK25-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK25-NEXT: movl %ebp, %edi -; FALLBACK25-NEXT: shrdl %cl, %edx, %edi -; FALLBACK25-NEXT: movl (%esp,%ebx), %esi -; FALLBACK25-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK25-NEXT: movl %eax, %ebx -; FALLBACK25-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %ebx, 4(%ebp) -; FALLBACK25-NEXT: movl %edi, 8(%ebp) -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: shrl %cl, %edx -; FALLBACK25-NEXT: movl %edx, 12(%ebp) -; FALLBACK25-NEXT: movl %esi, (%ebp) -; FALLBACK25-NEXT: addl $44, %esp -; FALLBACK25-NEXT: popl %esi -; FALLBACK25-NEXT: popl %edi -; FALLBACK25-NEXT: popl %ebx -; FALLBACK25-NEXT: popl %ebp -; FALLBACK25-NEXT: retl -; -; FALLBACK26-LABEL: lshr_16bytes: -; FALLBACK26: # %bb.0: -; FALLBACK26-NEXT: pushl %ebp -; FALLBACK26-NEXT: pushl %ebx -; FALLBACK26-NEXT: pushl %edi -; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $44, %esp -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax -; FALLBACK26-NEXT: shlb $3, %al -; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK26-NEXT: andb $12, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %eax, (%esp,%edi), %ebx -; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: notb %cl -; FALLBACK26-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK26-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK26-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK26-NEXT: orl %ebx, %edx -; 
FALLBACK26-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK26-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK26-NEXT: shrxl %eax, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK26-NEXT: orl %ebx, %edi -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK26-NEXT: orl %ebp, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl %eax, 12(%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) -; FALLBACK26-NEXT: movl %edi, 8(%esi) -; FALLBACK26-NEXT: movl %edx, (%esi) -; FALLBACK26-NEXT: addl $44, %esp -; FALLBACK26-NEXT: popl %esi -; FALLBACK26-NEXT: popl %edi -; FALLBACK26-NEXT: popl %ebx -; FALLBACK26-NEXT: popl %ebp -; FALLBACK26-NEXT: retl -; -; FALLBACK27-LABEL: lshr_16bytes: -; FALLBACK27: # %bb.0: -; FALLBACK27-NEXT: pushl %ebp -; FALLBACK27-NEXT: pushl %ebx -; FALLBACK27-NEXT: pushl %edi -; FALLBACK27-NEXT: pushl %esi -; FALLBACK27-NEXT: subl $44, %esp -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK27-NEXT: vmovups (%edx), %xmm0 -; FALLBACK27-NEXT: movzbl (%ecx), %edx -; FALLBACK27-NEXT: movl %edx, %ecx -; FALLBACK27-NEXT: shlb $3, %cl -; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK27-NEXT: andb $12, %dl -; FALLBACK27-NEXT: movzbl %dl, %ebx -; FALLBACK27-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK27-NEXT: movl %ebp, %edi -; FALLBACK27-NEXT: shrdl %cl, %edx, %edi -; FALLBACK27-NEXT: movl (%esp,%ebx), %esi -; FALLBACK27-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl %eax, %ebx -; FALLBACK27-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK27-NEXT: movl %ebx, 4(%ebp) -; FALLBACK27-NEXT: movl %edi, 8(%ebp) -; FALLBACK27-NEXT: shrxl %ecx, %edx, %edx -; FALLBACK27-NEXT: movl %edx, 12(%ebp) -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: shrdl %cl, %eax, %esi -; FALLBACK27-NEXT: movl %esi, (%ebp) -; FALLBACK27-NEXT: addl $44, %esp -; FALLBACK27-NEXT: popl %esi -; FALLBACK27-NEXT: popl %edi -; FALLBACK27-NEXT: popl %ebx -; FALLBACK27-NEXT: popl %ebp -; FALLBACK27-NEXT: retl -; -; FALLBACK28-LABEL: lshr_16bytes: -; FALLBACK28: # %bb.0: -; FALLBACK28-NEXT: pushl %ebp -; FALLBACK28-NEXT: pushl %ebx -; FALLBACK28-NEXT: pushl %edi -; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $60, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK28-NEXT: movzbl (%eax), %ecx -; FALLBACK28-NEXT: movl %ecx, %eax -; FALLBACK28-NEXT: shlb $3, %al -; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: andb $12, %cl -; FALLBACK28-NEXT: movzbl %cl, %edi -; FALLBACK28-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl 20(%esp,%edi), %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 
24(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, %esi -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: movl 28(%esp,%edi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %ebp -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %esi, %ebp -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %esi, %ebx -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: movl %edi, 12(%edx) -; FALLBACK28-NEXT: movl %ebx, 4(%edx) -; FALLBACK28-NEXT: movl %ebp, 8(%edx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, (%edx) -; FALLBACK28-NEXT: addl $60, %esp -; FALLBACK28-NEXT: popl %esi -; FALLBACK28-NEXT: popl %edi -; FALLBACK28-NEXT: popl %ebx -; FALLBACK28-NEXT: popl %ebp -; FALLBACK28-NEXT: retl -; -; FALLBACK29-LABEL: lshr_16bytes: -; FALLBACK29: # %bb.0: -; FALLBACK29-NEXT: pushl %ebp -; FALLBACK29-NEXT: pushl %ebx -; FALLBACK29-NEXT: pushl %edi -; FALLBACK29-NEXT: pushl %esi -; FALLBACK29-NEXT: subl $44, %esp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK29-NEXT: vmovups (%edx), %xmm0 -; FALLBACK29-NEXT: movzbl (%ecx), %edx -; FALLBACK29-NEXT: movl %edx, %ecx -; FALLBACK29-NEXT: shlb $3, %cl -; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK29-NEXT: andb $12, %dl -; FALLBACK29-NEXT: movzbl %dl, %ebx -; FALLBACK29-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK29-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK29-NEXT: movl %ebp, %edi -; FALLBACK29-NEXT: shrdl %cl, %edx, %edi -; FALLBACK29-NEXT: movl (%esp,%ebx), %esi -; FALLBACK29-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK29-NEXT: movl %eax, %ebx -; FALLBACK29-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %ebx, 4(%ebp) -; FALLBACK29-NEXT: movl %edi, 8(%ebp) -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: shrl %cl, %edx -; FALLBACK29-NEXT: movl %edx, 12(%ebp) -; FALLBACK29-NEXT: movl %esi, (%ebp) -; FALLBACK29-NEXT: addl $44, %esp -; FALLBACK29-NEXT: popl %esi -; FALLBACK29-NEXT: popl %edi -; FALLBACK29-NEXT: popl %ebx -; FALLBACK29-NEXT: popl %ebp -; FALLBACK29-NEXT: retl -; -; FALLBACK30-LABEL: lshr_16bytes: -; FALLBACK30: # %bb.0: -; FALLBACK30-NEXT: pushl %ebp -; FALLBACK30-NEXT: pushl %ebx -; FALLBACK30-NEXT: pushl %edi -; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $44, %esp -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shlb $3, %al -; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK30-NEXT: andb $12, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %eax, (%esp,%edi), %ebx -; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: notb %cl -; FALLBACK30-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK30-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK30-NEXT: 
shlxl %ecx, %edx, %edx -; FALLBACK30-NEXT: orl %ebx, %edx -; FALLBACK30-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK30-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK30-NEXT: shrxl %eax, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK30-NEXT: orl %ebx, %edi -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK30-NEXT: orl %ebp, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl %eax, 12(%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) -; FALLBACK30-NEXT: movl %edi, 8(%esi) -; FALLBACK30-NEXT: movl %edx, (%esi) -; FALLBACK30-NEXT: addl $44, %esp -; FALLBACK30-NEXT: popl %esi -; FALLBACK30-NEXT: popl %edi -; FALLBACK30-NEXT: popl %ebx -; FALLBACK30-NEXT: popl %ebp -; FALLBACK30-NEXT: retl -; -; FALLBACK31-LABEL: lshr_16bytes: -; FALLBACK31: # %bb.0: -; FALLBACK31-NEXT: pushl %ebp -; FALLBACK31-NEXT: pushl %ebx -; FALLBACK31-NEXT: pushl %edi -; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $44, %esp -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK31-NEXT: vmovups (%edx), %xmm0 -; FALLBACK31-NEXT: movzbl (%ecx), %edx -; FALLBACK31-NEXT: movl %edx, %ecx -; FALLBACK31-NEXT: shlb $3, %cl -; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK31-NEXT: andb $12, %dl -; FALLBACK31-NEXT: movzbl %dl, %ebx -; FALLBACK31-NEXT: movl 12(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl 8(%esp,%ebx), %ebp -; FALLBACK31-NEXT: movl %ebp, %edi -; FALLBACK31-NEXT: shrdl %cl, %edx, %edi -; FALLBACK31-NEXT: movl (%esp,%ebx), %esi -; FALLBACK31-NEXT: movl 4(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, %ebx -; FALLBACK31-NEXT: shrdl %cl, %ebp, %ebx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK31-NEXT: movl %ebx, 4(%ebp) -; FALLBACK31-NEXT: movl %edi, 8(%ebp) -; FALLBACK31-NEXT: shrxl %ecx, %edx, %edx -; FALLBACK31-NEXT: movl %edx, 12(%ebp) -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: shrdl %cl, %eax, %esi -; FALLBACK31-NEXT: movl %esi, (%ebp) -; FALLBACK31-NEXT: addl $44, %esp -; FALLBACK31-NEXT: popl %esi -; FALLBACK31-NEXT: popl %edi -; FALLBACK31-NEXT: popl %ebx -; FALLBACK31-NEXT: popl %ebp -; FALLBACK31-NEXT: retl +; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_16bytes: +; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $60, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb (%eax), %ah +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ah, %al +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %al +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; 
X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $12, %ah +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %ah, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 12(%edx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 8(%edx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%edx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 4(%edx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $60, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_16bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%edx), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%edx), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%edx), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%edx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb (%ecx), %ch +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $12, %ch +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %ch, %ebx 
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%esp,%ebx), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp,%ebx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%esp,%ebx), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%esp,%ebx), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %ebp, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 12(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_16bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $44, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%eax), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %al +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $12, %bl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %bl, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%esp,%edi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%esp,%edi), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %ebx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, (%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%esp,%edi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 12(%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: 
movl %ebx, (%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 4(%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $44, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_16bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%edx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%edx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%edx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%edx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, (%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $12, %al +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %al, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%esp,%eax), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp,%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%esp,%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%esp,%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_16bytes: +; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $60, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, 
%al +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $12, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%esp,%edi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 12(%edx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 4(%edx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 8(%edx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $60, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_16bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%edx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $12, %dl +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %dl, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 12(%esp,%ebx), 
%edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 8(%esp,%ebx), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp,%ebx), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 4(%esp,%ebx), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_16bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $60, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %al +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $12, %dl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %dl, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%esp,%edi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%esp,%edi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%ebp,%ebp), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %ebx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 8(%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, (%esi) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $60, %esp +; 
X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_16bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%edx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $12, %dl +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %dl, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 12(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 8(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 4(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_16bytes: +; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $60, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %al +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $12, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 16(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 20(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 24(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 28(%esp,%edi), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 12(%edx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 4(%edx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 8(%edx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $60, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_16bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%edx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, (%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $12, %dl +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %dl, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 12(%esp,%ebx), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 8(%esp,%ebx), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%esp,%ebx), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 4(%esp,%ebx), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebp, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: 
movl %ebx, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_16bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $60, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %al +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $12, %dl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %dl, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%esp,%edi), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%esp,%edi), %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %eax, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 28(%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%ebp,%ebp), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %eax, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, %ebx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %eax, %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 12(%esi) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 4(%esi) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 8(%esi) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, (%esi) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $60, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_16bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $44, %esp +; 
X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%edx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, (%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $12, %dl +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %dl, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 12(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 8(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 4(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl %src = load i128, ptr %src.ptr, align 1 %byteOff = load i128, ptr %byteOff.ptr, align 1 %bitOff = shl i128 %byteOff, 3 @@ -1661,791 +1473,599 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq ; -; FALLBACK16-LABEL: shl_16bytes: -; FALLBACK16: # %bb.0: -; FALLBACK16-NEXT: pushl %ebp -; FALLBACK16-NEXT: pushl %ebx -; FALLBACK16-NEXT: pushl %edi -; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $60, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: movl (%ecx), %ebx -; FALLBACK16-NEXT: movl 4(%ecx), %esi -; FALLBACK16-NEXT: movl 8(%ecx), %edi -; FALLBACK16-NEXT: movl 12(%ecx), %ecx -; FALLBACK16-NEXT: movb (%eax), %ah -; FALLBACK16-NEXT: movb %ah, %dh -; FALLBACK16-NEXT: shlb $3, %dh -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: andb $12, %ah -; FALLBACK16-NEXT: negb %ah -; FALLBACK16-NEXT: movsbl %ah, %ebp -; FALLBACK16-NEXT: movl 32(%esp,%ebp), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%esp,%ebp), %esi -; FALLBACK16-NEXT: movl %esi, %edi -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: movb %dh, %dl -; FALLBACK16-NEXT: notb %dl -; FALLBACK16-NEXT: shrl %ebx -; FALLBACK16-NEXT: movl %edx, %ecx -; 
FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: movl 40(%esp,%ebp), %edi -; FALLBACK16-NEXT: movl %edi, %ebp -; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: shrl %esi -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: orl %edi, %esi -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: movl %edx, (%eax) -; FALLBACK16-NEXT: movl %esi, 8(%eax) -; FALLBACK16-NEXT: movl %ebp, 12(%eax) -; FALLBACK16-NEXT: movl %ebx, 4(%eax) -; FALLBACK16-NEXT: addl $60, %esp -; FALLBACK16-NEXT: popl %esi -; FALLBACK16-NEXT: popl %edi -; FALLBACK16-NEXT: popl %ebx -; FALLBACK16-NEXT: popl %ebp -; FALLBACK16-NEXT: retl -; -; FALLBACK17-LABEL: shl_16bytes: -; FALLBACK17: # %bb.0: -; FALLBACK17-NEXT: pushl %ebx -; FALLBACK17-NEXT: pushl %edi -; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $32, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK17-NEXT: movl (%edx), %esi -; FALLBACK17-NEXT: movl 4(%edx), %edi -; FALLBACK17-NEXT: movl 8(%edx), %ebx -; FALLBACK17-NEXT: movl 12(%edx), %edx -; FALLBACK17-NEXT: movb (%ecx), %ch -; FALLBACK17-NEXT: movb %ch, %cl -; FALLBACK17-NEXT: shlb $3, %cl -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, (%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: andb $12, %ch -; FALLBACK17-NEXT: negb %ch -; FALLBACK17-NEXT: movsbl %ch, %edi -; FALLBACK17-NEXT: movl 24(%esp,%edi), %esi -; FALLBACK17-NEXT: movl 28(%esp,%edi), %edx -; FALLBACK17-NEXT: shldl %cl, %esi, %edx -; FALLBACK17-NEXT: movl 16(%esp,%edi), %ebx -; FALLBACK17-NEXT: movl 20(%esp,%edi), %edi -; FALLBACK17-NEXT: shldl %cl, %edi, %esi -; FALLBACK17-NEXT: shldl %cl, %ebx, %edi -; FALLBACK17-NEXT: shll %cl, %ebx -; FALLBACK17-NEXT: movl %esi, 8(%eax) -; FALLBACK17-NEXT: movl %edx, 12(%eax) -; FALLBACK17-NEXT: movl %ebx, (%eax) -; FALLBACK17-NEXT: movl %edi, 4(%eax) -; FALLBACK17-NEXT: addl $32, %esp -; FALLBACK17-NEXT: popl %esi -; FALLBACK17-NEXT: popl %edi -; FALLBACK17-NEXT: popl %ebx -; FALLBACK17-NEXT: retl -; -; FALLBACK18-LABEL: shl_16bytes: -; FALLBACK18: # %bb.0: -; FALLBACK18-NEXT: pushl %ebp -; FALLBACK18-NEXT: pushl %ebx -; FALLBACK18-NEXT: pushl %edi -; FALLBACK18-NEXT: pushl %esi -; FALLBACK18-NEXT: subl $44, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK18-NEXT: movl (%ecx), %edx -; FALLBACK18-NEXT: movl 4(%ecx), %esi -; FALLBACK18-NEXT: movl 8(%ecx), %edi -; FALLBACK18-NEXT: movl 12(%ecx), %ecx -; FALLBACK18-NEXT: movzbl (%eax), %eax -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: shlb $3, %bl -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, (%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; 
FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: andb $12, %al
-; FALLBACK18-NEXT: negb %al
-; FALLBACK18-NEXT: movsbl %al, %edx
-; FALLBACK18-NEXT: movl 16(%esp,%edx), %edi
-; FALLBACK18-NEXT: movl 20(%esp,%edx), %ecx
-; FALLBACK18-NEXT: shlxl %ebx, %ecx, %esi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp
-; FALLBACK18-NEXT: movl %ebx, %eax
-; FALLBACK18-NEXT: notb %al
-; FALLBACK18-NEXT: shrl %edi
-; FALLBACK18-NEXT: shrxl %eax, %edi, %edi
-; FALLBACK18-NEXT: orl %esi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, 28(%esp,%edx), %esi
-; FALLBACK18-NEXT: movl 24(%esp,%edx), %edx
-; FALLBACK18-NEXT: shlxl %ebx, %edx, %ebx
-; FALLBACK18-NEXT: shrl %edx
-; FALLBACK18-NEXT: shrxl %eax, %edx, %edx
-; FALLBACK18-NEXT: orl %esi, %edx
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %eax, %ecx, %eax
-; FALLBACK18-NEXT: orl %ebx, %eax
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK18-NEXT: movl %ebp, (%ecx)
-; FALLBACK18-NEXT: movl %eax, 8(%ecx)
-; FALLBACK18-NEXT: movl %edx, 12(%ecx)
-; FALLBACK18-NEXT: movl %edi, 4(%ecx)
-; FALLBACK18-NEXT: addl $44, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: shl_16bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $44, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK19-NEXT: movl (%edx), %esi
-; FALLBACK19-NEXT: movl 4(%edx), %edi
-; FALLBACK19-NEXT: movl 8(%edx), %ebx
-; FALLBACK19-NEXT: movl 12(%edx), %edx
-; FALLBACK19-NEXT: movzbl (%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, %ecx
-; FALLBACK19-NEXT: shlb $3, %cl
-; FALLBACK19-NEXT: xorps %xmm0, %xmm0
-; FALLBACK19-NEXT: movaps %xmm0, (%esp)
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: andb $12, %al
-; FALLBACK19-NEXT: negb %al
-; FALLBACK19-NEXT: movsbl %al, %eax
-; FALLBACK19-NEXT: movl 24(%esp,%eax), %esi
-; FALLBACK19-NEXT: movl 28(%esp,%eax), %edx
-; FALLBACK19-NEXT: shldl %cl, %esi, %edx
-; FALLBACK19-NEXT: movl 16(%esp,%eax), %edi
-; FALLBACK19-NEXT: movl 20(%esp,%eax), %eax
-; FALLBACK19-NEXT: shldl %cl, %eax, %esi
-; FALLBACK19-NEXT: shldl %cl, %edi, %eax
-; FALLBACK19-NEXT: shlxl %ecx, %edi, %ecx
-; FALLBACK19-NEXT: movl %esi, 8(%ebp)
-; FALLBACK19-NEXT: movl %edx, 12(%ebp)
-; FALLBACK19-NEXT: movl %ecx, (%ebp)
-; FALLBACK19-NEXT: movl %eax, 4(%ebp)
-; FALLBACK19-NEXT: addl $44, %esp
-; FALLBACK19-NEXT: popl %esi
-; FALLBACK19-NEXT: popl %edi
-; FALLBACK19-NEXT: popl %ebx
-; FALLBACK19-NEXT: popl %ebp
-; FALLBACK19-NEXT: retl
-;
-; FALLBACK20-LABEL: shl_16bytes:
-; FALLBACK20: # %bb.0:
-; FALLBACK20-NEXT: pushl %ebp
-; FALLBACK20-NEXT: pushl %ebx
-; FALLBACK20-NEXT: pushl %edi
-; FALLBACK20-NEXT: pushl %esi
-; FALLBACK20-NEXT: subl $60, %esp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movups (%ecx), %xmm0
-; FALLBACK20-NEXT: movzbl (%eax), %ecx
-; FALLBACK20-NEXT: movl %ecx, %eax
-; FALLBACK20-NEXT: shlb $3, %al
-; FALLBACK20-NEXT: xorps %xmm1, %xmm1
-; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: andb $12, %cl
-; FALLBACK20-NEXT: negb %cl
-; FALLBACK20-NEXT: movsbl %cl, %edi
-; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl %eax, %edx
-; FALLBACK20-NEXT: notb %dl
-; FALLBACK20-NEXT: movl 40(%esp,%edi), %ebp
-; FALLBACK20-NEXT: movl %ebp, %esi
-; FALLBACK20-NEXT: shrl %esi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shrl %cl, %esi
-; FALLBACK20-NEXT: orl %ebx, %esi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebp
-; FALLBACK20-NEXT: movl 32(%esp,%edi), %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %edi
-; FALLBACK20-NEXT: shrl %edi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: orl %ebp, %edi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK20-NEXT: shrl %ebp
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: orl %ebx, %ebp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: shll %cl, %eax
-; FALLBACK20-NEXT: movl %eax, (%edx)
-; FALLBACK20-NEXT: movl %ebp, 4(%edx)
-; FALLBACK20-NEXT: movl %edi, 8(%edx)
-; FALLBACK20-NEXT: movl %esi, 12(%edx)
-; FALLBACK20-NEXT: addl $60, %esp
-; FALLBACK20-NEXT: popl %esi
-; FALLBACK20-NEXT: popl %edi
-; FALLBACK20-NEXT: popl %ebx
-; FALLBACK20-NEXT: popl %ebp
-; FALLBACK20-NEXT: retl
-;
-; FALLBACK21-LABEL: shl_16bytes:
-; FALLBACK21: # %bb.0:
-; FALLBACK21-NEXT: pushl %ebp
-; FALLBACK21-NEXT: pushl %ebx
-; FALLBACK21-NEXT: pushl %edi
-; FALLBACK21-NEXT: pushl %esi
-; FALLBACK21-NEXT: subl $44, %esp
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK21-NEXT: movups (%edx), %xmm0
-; FALLBACK21-NEXT: movzbl (%ecx), %edx
-; FALLBACK21-NEXT: movl %edx, %ecx
-; FALLBACK21-NEXT: shlb $3, %cl
-; FALLBACK21-NEXT: xorps %xmm1, %xmm1
-; FALLBACK21-NEXT: movaps %xmm1, (%esp)
-; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: andb $12, %dl
-; FALLBACK21-NEXT: negb %dl
-; FALLBACK21-NEXT: movsbl %dl, %edi
-; FALLBACK21-NEXT: movl 24(%esp,%edi), %esi
-; FALLBACK21-NEXT: movl 28(%esp,%edi), %edx
-; FALLBACK21-NEXT: shldl %cl, %esi, %edx
-; FALLBACK21-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK21-NEXT: movl 20(%esp,%edi), %edi
-; FALLBACK21-NEXT: shldl %cl, %edi, %esi
-; FALLBACK21-NEXT: movl %ebx, %ebp
-; FALLBACK21-NEXT: shll %cl, %ebp
-; FALLBACK21-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK21-NEXT: movl %edi, 4(%eax)
-; FALLBACK21-NEXT: movl %esi, 8(%eax)
-; FALLBACK21-NEXT: movl %edx, 12(%eax)
-; FALLBACK21-NEXT: movl %ebp, (%eax)
-; FALLBACK21-NEXT: addl $44, %esp
-; FALLBACK21-NEXT: popl %esi
-; FALLBACK21-NEXT: popl %edi
-; FALLBACK21-NEXT: popl %ebx
-; FALLBACK21-NEXT: popl %ebp
-; FALLBACK21-NEXT: retl
-;
-; FALLBACK22-LABEL: shl_16bytes:
-; FALLBACK22: # %bb.0:
-; FALLBACK22-NEXT: pushl %ebp
-; FALLBACK22-NEXT: pushl %ebx
-; FALLBACK22-NEXT: pushl %edi
-; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $44, %esp
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movzbl (%eax), %ecx
-; FALLBACK22-NEXT: movl %ecx, %eax
-; FALLBACK22-NEXT: shlb $3, %al
-; FALLBACK22-NEXT: xorps %xmm1, %xmm1
-; FALLBACK22-NEXT: movaps %xmm1, (%esp)
-; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: andb $12, %cl
-; FALLBACK22-NEXT: negb %cl
-; FALLBACK22-NEXT: movsbl %cl, %ecx
-; FALLBACK22-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
-; FALLBACK22-NEXT: movl 24(%esp,%ecx), %edx
-; FALLBACK22-NEXT: shlxl %eax, %edx, %edi
-; FALLBACK22-NEXT: movl %eax, %ebx
-; FALLBACK22-NEXT: notb %bl
-; FALLBACK22-NEXT: shrl %edx
-; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx
-; FALLBACK22-NEXT: orl %esi, %edx
-; FALLBACK22-NEXT: movl 20(%esp,%ecx), %esi
-; FALLBACK22-NEXT: movl %esi, %ebp
-; FALLBACK22-NEXT: shrl %ebp
-; FALLBACK22-NEXT: shrxl %ebx, %ebp, %ebp
-; FALLBACK22-NEXT: orl %edi, %ebp
-; FALLBACK22-NEXT: shlxl %eax, %esi, %esi
-; FALLBACK22-NEXT: movl 16(%esp,%ecx), %ecx
-; FALLBACK22-NEXT: shlxl %eax, %ecx, %eax
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT: orl %esi, %ecx
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK22-NEXT: movl %eax, (%esi)
-; FALLBACK22-NEXT: movl %ecx, 4(%esi)
-; FALLBACK22-NEXT: movl %ebp, 8(%esi)
-; FALLBACK22-NEXT: movl %edx, 12(%esi)
-; FALLBACK22-NEXT: addl $44, %esp
-; FALLBACK22-NEXT: popl %esi
-; FALLBACK22-NEXT: popl %edi
-; FALLBACK22-NEXT: popl %ebx
-; FALLBACK22-NEXT: popl %ebp
-; FALLBACK22-NEXT: retl
-;
-; FALLBACK23-LABEL: shl_16bytes:
-; FALLBACK23: # %bb.0:
-; FALLBACK23-NEXT: pushl %ebp
-; FALLBACK23-NEXT: pushl %ebx
-; FALLBACK23-NEXT: pushl %edi
-; FALLBACK23-NEXT: pushl %esi
-; FALLBACK23-NEXT: subl $44, %esp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK23-NEXT: movups (%edx), %xmm0
-; FALLBACK23-NEXT: movzbl (%ecx), %edx
-; FALLBACK23-NEXT: movl %edx, %ecx
-; FALLBACK23-NEXT: shlb $3, %cl
-; FALLBACK23-NEXT: xorps %xmm1, %xmm1
-; FALLBACK23-NEXT: movaps %xmm1, (%esp)
-; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: andb $12, %dl
-; FALLBACK23-NEXT: negb %dl
-; FALLBACK23-NEXT: movsbl %dl, %edi
-; FALLBACK23-NEXT: movl 24(%esp,%edi), %esi
-; FALLBACK23-NEXT: movl 28(%esp,%edi), %edx
-; FALLBACK23-NEXT: shldl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK23-NEXT: movl 20(%esp,%edi), %edi
-; FALLBACK23-NEXT: shldl %cl, %edi, %esi
-; FALLBACK23-NEXT: shlxl %ecx, %ebx, %ebp
-; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK23-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK23-NEXT: movl %edi, 4(%eax)
-; FALLBACK23-NEXT: movl %esi, 8(%eax)
-; FALLBACK23-NEXT: movl %edx, 12(%eax)
-; FALLBACK23-NEXT: movl %ebp, (%eax)
-; FALLBACK23-NEXT: addl $44, %esp
-; FALLBACK23-NEXT: popl %esi
-; FALLBACK23-NEXT: popl %edi
-; FALLBACK23-NEXT: popl %ebx
-; FALLBACK23-NEXT: popl %ebp
-; FALLBACK23-NEXT: retl
-;
-; FALLBACK24-LABEL: shl_16bytes:
-; FALLBACK24: # %bb.0:
-; FALLBACK24-NEXT: pushl %ebp
-; FALLBACK24-NEXT: pushl %ebx
-; FALLBACK24-NEXT: pushl %edi
-; FALLBACK24-NEXT: pushl %esi
-; FALLBACK24-NEXT: subl $60, %esp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK24-NEXT: movzbl (%eax), %ecx
-; FALLBACK24-NEXT: movl %ecx, %eax
-; FALLBACK24-NEXT: shlb $3, %al
-; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: andb $12, %cl
-; FALLBACK24-NEXT: negb %cl
-; FALLBACK24-NEXT: movsbl %cl, %edi
-; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl %eax, %edx
-; FALLBACK24-NEXT: notb %dl
-; FALLBACK24-NEXT: movl 40(%esp,%edi), %ebp
-; FALLBACK24-NEXT: movl %ebp, %esi
-; FALLBACK24-NEXT: shrl %esi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shrl %cl, %esi
-; FALLBACK24-NEXT: orl %ebx, %esi
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebp
-; FALLBACK24-NEXT: movl 32(%esp,%edi), %ecx
-; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %edi
-; FALLBACK24-NEXT: shrl %edi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: orl %ebp, %edi
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK24-NEXT: shrl %ebp
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: orl %ebx, %ebp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: shll %cl, %eax
-; FALLBACK24-NEXT: movl %eax, (%edx)
-; FALLBACK24-NEXT: movl %ebp, 4(%edx)
-; FALLBACK24-NEXT: movl %edi, 8(%edx)
-; FALLBACK24-NEXT: movl %esi, 12(%edx)
-; FALLBACK24-NEXT: addl $60, %esp
-; FALLBACK24-NEXT: popl %esi
-; FALLBACK24-NEXT: popl %edi
-; FALLBACK24-NEXT: popl %ebx
-; FALLBACK24-NEXT: popl %ebp
-; FALLBACK24-NEXT: retl
-;
-; FALLBACK25-LABEL: shl_16bytes:
-; FALLBACK25: # %bb.0:
-; FALLBACK25-NEXT: pushl %ebp
-; FALLBACK25-NEXT: pushl %ebx
-; FALLBACK25-NEXT: pushl %edi
-; FALLBACK25-NEXT: pushl %esi
-; FALLBACK25-NEXT: subl $44, %esp
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK25-NEXT: vmovups (%edx), %xmm0
-; FALLBACK25-NEXT: movzbl (%ecx), %edx
-; FALLBACK25-NEXT: movl %edx, %ecx
-; FALLBACK25-NEXT: shlb $3, %cl
-; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK25-NEXT: vmovaps %xmm1, (%esp)
-; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: andb $12, %dl
-; FALLBACK25-NEXT: negb %dl
-; FALLBACK25-NEXT: movsbl %dl, %edi
-; FALLBACK25-NEXT: movl 24(%esp,%edi), %esi
-; FALLBACK25-NEXT: movl 28(%esp,%edi), %edx
-; FALLBACK25-NEXT: shldl %cl, %esi, %edx
-; FALLBACK25-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK25-NEXT: movl 20(%esp,%edi), %edi
-; FALLBACK25-NEXT: shldl %cl, %edi, %esi
-; FALLBACK25-NEXT: movl %ebx, %ebp
-; FALLBACK25-NEXT: shll %cl, %ebp
-; FALLBACK25-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK25-NEXT: movl %edi, 4(%eax)
-; FALLBACK25-NEXT: movl %esi, 8(%eax)
-; FALLBACK25-NEXT: movl %edx, 12(%eax)
-; FALLBACK25-NEXT: movl %ebp, (%eax)
-; FALLBACK25-NEXT: addl $44, %esp
-; FALLBACK25-NEXT: popl %esi
-; FALLBACK25-NEXT: popl %edi
-; FALLBACK25-NEXT: popl %ebx
-; FALLBACK25-NEXT: popl %ebp
-; FALLBACK25-NEXT: retl
-;
-; FALLBACK26-LABEL: shl_16bytes:
-; FALLBACK26: # %bb.0:
-; FALLBACK26-NEXT: pushl %ebp
-; FALLBACK26-NEXT: pushl %ebx
-; FALLBACK26-NEXT: pushl %edi
-; FALLBACK26-NEXT: pushl %esi
-; FALLBACK26-NEXT: subl $44, %esp
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK26-NEXT: movzbl (%eax), %ecx
-; FALLBACK26-NEXT: movl %ecx, %eax
-; FALLBACK26-NEXT: shlb $3, %al
-; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK26-NEXT: vmovaps %xmm1, (%esp)
-; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: andb $12, %cl
-; FALLBACK26-NEXT: negb %cl
-; FALLBACK26-NEXT: movsbl %cl, %ecx
-; FALLBACK26-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
-; FALLBACK26-NEXT: movl 24(%esp,%ecx), %edx
-; FALLBACK26-NEXT: shlxl %eax, %edx, %edi
-; FALLBACK26-NEXT: movl %eax, %ebx
-; FALLBACK26-NEXT: notb %bl
-; FALLBACK26-NEXT: shrl %edx
-; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx
-; FALLBACK26-NEXT: orl %esi, %edx
-; FALLBACK26-NEXT: movl 20(%esp,%ecx), %esi
-; FALLBACK26-NEXT: movl %esi, %ebp
-; FALLBACK26-NEXT: shrl %ebp
-; FALLBACK26-NEXT: shrxl %ebx, %ebp, %ebp
-; FALLBACK26-NEXT: orl %edi, %ebp
-; FALLBACK26-NEXT: shlxl %eax, %esi, %esi
-; FALLBACK26-NEXT: movl 16(%esp,%ecx), %ecx
-; FALLBACK26-NEXT: shlxl %eax, %ecx, %eax
-; FALLBACK26-NEXT: shrl %ecx
-; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT: orl %esi, %ecx
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK26-NEXT: movl %eax, (%esi)
-; FALLBACK26-NEXT: movl %ecx, 4(%esi)
-; FALLBACK26-NEXT: movl %ebp, 8(%esi)
-; FALLBACK26-NEXT: movl %edx, 12(%esi)
-; FALLBACK26-NEXT: addl $44, %esp
-; FALLBACK26-NEXT: popl %esi
-; FALLBACK26-NEXT: popl %edi
-; FALLBACK26-NEXT: popl %ebx
-; FALLBACK26-NEXT: popl %ebp
-; FALLBACK26-NEXT: retl
-;
-; FALLBACK27-LABEL: shl_16bytes:
-; FALLBACK27: # %bb.0:
-; FALLBACK27-NEXT: pushl %ebp
-; FALLBACK27-NEXT: pushl %ebx
-; FALLBACK27-NEXT: pushl %edi
-; FALLBACK27-NEXT: pushl %esi
-; FALLBACK27-NEXT: subl $44, %esp
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK27-NEXT: vmovups (%edx), %xmm0
-; FALLBACK27-NEXT: movzbl (%ecx), %edx
-; FALLBACK27-NEXT: movl %edx, %ecx
-; FALLBACK27-NEXT: shlb $3, %cl
-; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK27-NEXT: vmovaps %xmm1, (%esp)
-; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: andb $12, %dl
-; FALLBACK27-NEXT: negb %dl
-; FALLBACK27-NEXT: movsbl %dl, %edi
-; FALLBACK27-NEXT: movl 24(%esp,%edi), %esi
-; FALLBACK27-NEXT: movl 28(%esp,%edi), %edx
-; FALLBACK27-NEXT: shldl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK27-NEXT: movl 20(%esp,%edi), %edi
-; FALLBACK27-NEXT: shldl %cl, %edi, %esi
-; FALLBACK27-NEXT: shlxl %ecx, %ebx, %ebp
-; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK27-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK27-NEXT: movl %edi, 4(%eax)
-; FALLBACK27-NEXT: movl %esi, 8(%eax)
-; FALLBACK27-NEXT: movl %edx, 12(%eax)
-; FALLBACK27-NEXT: movl %ebp, (%eax)
-; FALLBACK27-NEXT: addl $44, %esp
-; FALLBACK27-NEXT: popl %esi
-; FALLBACK27-NEXT: popl %edi
-; FALLBACK27-NEXT: popl %ebx
-; FALLBACK27-NEXT: popl %ebp
-; FALLBACK27-NEXT: retl
-;
-; FALLBACK28-LABEL: shl_16bytes:
-; FALLBACK28: # %bb.0:
-; FALLBACK28-NEXT: pushl %ebp
-; FALLBACK28-NEXT: pushl %ebx
-; FALLBACK28-NEXT: pushl %edi
-; FALLBACK28-NEXT: pushl %esi
-; FALLBACK28-NEXT: subl $60, %esp
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK28-NEXT: movzbl (%eax), %ecx
-; FALLBACK28-NEXT: movl %ecx, %eax
-; FALLBACK28-NEXT: shlb $3, %al
-; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: andb $12, %cl
-; FALLBACK28-NEXT: negb %cl
-; FALLBACK28-NEXT: movsbl %cl, %edi
-; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: movl %eax, %edx
-; FALLBACK28-NEXT: notb %dl
-; FALLBACK28-NEXT: movl 40(%esp,%edi), %ebp
-; FALLBACK28-NEXT: movl %ebp, %esi
-; FALLBACK28-NEXT: shrl %esi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shrl %cl, %esi
-; FALLBACK28-NEXT: orl %ebx, %esi
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebp
-; FALLBACK28-NEXT: movl 32(%esp,%edi), %ecx
-; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %edi
-; FALLBACK28-NEXT: shrl %edi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: orl %ebp, %edi
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK28-NEXT: shrl %ebp
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: orl %ebx, %ebp
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: shll %cl, %eax
-; FALLBACK28-NEXT: movl %eax, (%edx)
-; FALLBACK28-NEXT: movl %ebp, 4(%edx)
-; FALLBACK28-NEXT: movl %edi, 8(%edx)
-; FALLBACK28-NEXT: movl %esi, 12(%edx)
-; FALLBACK28-NEXT: addl $60, %esp
-; FALLBACK28-NEXT: popl %esi
-; FALLBACK28-NEXT: popl %edi
-; FALLBACK28-NEXT: popl %ebx
-; FALLBACK28-NEXT: popl %ebp
-; FALLBACK28-NEXT: retl
-;
-; FALLBACK29-LABEL: shl_16bytes:
-; FALLBACK29: # %bb.0:
-; FALLBACK29-NEXT: pushl %ebp
-; FALLBACK29-NEXT: pushl %ebx
-; FALLBACK29-NEXT: pushl %edi
-; FALLBACK29-NEXT: pushl %esi
-; FALLBACK29-NEXT: subl $44, %esp
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK29-NEXT: vmovups (%edx), %xmm0
-; FALLBACK29-NEXT: movzbl (%ecx), %edx
-; FALLBACK29-NEXT: movl %edx, %ecx
-; FALLBACK29-NEXT: shlb $3, %cl
-; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK29-NEXT: vmovaps %xmm1, (%esp)
-; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: andb $12, %dl
-; FALLBACK29-NEXT: negb %dl
-; FALLBACK29-NEXT: movsbl %dl, %edi
-; FALLBACK29-NEXT: movl 24(%esp,%edi), %esi
-; FALLBACK29-NEXT: movl 28(%esp,%edi), %edx
-; FALLBACK29-NEXT: shldl %cl, %esi, %edx
-; FALLBACK29-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK29-NEXT: movl 20(%esp,%edi), %edi
-; FALLBACK29-NEXT: shldl %cl, %edi, %esi
-; FALLBACK29-NEXT: movl %ebx, %ebp
-; FALLBACK29-NEXT: shll %cl, %ebp
-; FALLBACK29-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK29-NEXT: movl %edi, 4(%eax)
-; FALLBACK29-NEXT: movl %esi, 8(%eax)
-; FALLBACK29-NEXT: movl %edx, 12(%eax)
-; FALLBACK29-NEXT: movl %ebp, (%eax)
-; FALLBACK29-NEXT: addl $44, %esp
-; FALLBACK29-NEXT: popl %esi
-; FALLBACK29-NEXT: popl %edi
-; FALLBACK29-NEXT: popl %ebx
-; FALLBACK29-NEXT: popl %ebp
-; FALLBACK29-NEXT: retl
-;
-; FALLBACK30-LABEL: shl_16bytes:
-; FALLBACK30: # %bb.0:
-; FALLBACK30-NEXT: pushl %ebp
-; FALLBACK30-NEXT: pushl %ebx
-; FALLBACK30-NEXT: pushl %edi
-; FALLBACK30-NEXT: pushl %esi
-; FALLBACK30-NEXT: subl $44, %esp
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK30-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK30-NEXT: movzbl (%eax), %ecx
-; FALLBACK30-NEXT: movl %ecx, %eax
-; FALLBACK30-NEXT: shlb $3, %al
-; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK30-NEXT: vmovaps %xmm1, (%esp)
-; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: andb $12, %cl
-; FALLBACK30-NEXT: negb %cl
-; FALLBACK30-NEXT: movsbl %cl, %ecx
-; FALLBACK30-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
-; FALLBACK30-NEXT: movl 24(%esp,%ecx), %edx
-; FALLBACK30-NEXT: shlxl %eax, %edx, %edi
-; FALLBACK30-NEXT: movl %eax, %ebx
-; FALLBACK30-NEXT: notb %bl
-; FALLBACK30-NEXT: shrl %edx
-; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx
-; FALLBACK30-NEXT: orl %esi, %edx
-; FALLBACK30-NEXT: movl 20(%esp,%ecx), %esi
-; FALLBACK30-NEXT: movl %esi, %ebp
-; FALLBACK30-NEXT: shrl %ebp
-; FALLBACK30-NEXT: shrxl %ebx, %ebp, %ebp
-; FALLBACK30-NEXT: orl %edi, %ebp
-; FALLBACK30-NEXT: shlxl %eax, %esi, %esi
-; FALLBACK30-NEXT: movl 16(%esp,%ecx), %ecx
-; FALLBACK30-NEXT: shlxl %eax, %ecx, %eax
-; FALLBACK30-NEXT: shrl %ecx
-; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT: orl %esi, %ecx
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK30-NEXT: movl %eax, (%esi)
-; FALLBACK30-NEXT: movl %ecx, 4(%esi)
-; FALLBACK30-NEXT: movl %ebp, 8(%esi)
-; FALLBACK30-NEXT: movl %edx, 12(%esi)
-; FALLBACK30-NEXT: addl $44, %esp
-; FALLBACK30-NEXT: popl %esi
-; FALLBACK30-NEXT: popl %edi
-; FALLBACK30-NEXT: popl %ebx
-; FALLBACK30-NEXT: popl %ebp
-; FALLBACK30-NEXT: retl
-;
-; FALLBACK31-LABEL: shl_16bytes:
-; FALLBACK31: # %bb.0:
-; FALLBACK31-NEXT: pushl %ebp
-; FALLBACK31-NEXT: pushl %ebx
-; FALLBACK31-NEXT: pushl %edi
-; FALLBACK31-NEXT: pushl %esi
-; FALLBACK31-NEXT: subl $44, %esp
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK31-NEXT: vmovups (%edx), %xmm0
-; FALLBACK31-NEXT: movzbl (%ecx), %edx
-; FALLBACK31-NEXT: movl %edx, %ecx
-; FALLBACK31-NEXT: shlb $3, %cl
-; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK31-NEXT: vmovaps %xmm1, (%esp)
-; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: andb $12, %dl
-; FALLBACK31-NEXT: negb %dl
-; FALLBACK31-NEXT: movsbl %dl, %edi
-; FALLBACK31-NEXT: movl 24(%esp,%edi), %esi
-; FALLBACK31-NEXT: movl 28(%esp,%edi), %edx
-; FALLBACK31-NEXT: shldl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK31-NEXT: movl 20(%esp,%edi), %edi
-; FALLBACK31-NEXT: shldl %cl, %edi, %esi
-; FALLBACK31-NEXT: shlxl %ecx, %ebx, %ebp
-; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK31-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK31-NEXT: movl %edi, 4(%eax)
-; FALLBACK31-NEXT: movl %esi, 8(%eax)
-; FALLBACK31-NEXT: movl %edx, 12(%eax)
-; FALLBACK31-NEXT: movl %ebp, (%eax)
-; FALLBACK31-NEXT: addl $44, %esp
-; FALLBACK31-NEXT: popl %esi
-; FALLBACK31-NEXT: popl %edi
-; FALLBACK31-NEXT: popl %ebx
-; FALLBACK31-NEXT: popl %ebp
-; FALLBACK31-NEXT: retl
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_16bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb (%eax), %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ah, %dh
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %dh
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $12, %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbl %ah, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_16bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $32, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%edx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb (%ecx), %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $12, %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbl %ch, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esp,%edi), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esp,%edi), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esp,%edi), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esp,%edi), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $32, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_16bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $12, %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbl %bl, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, 28(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esp,%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, (%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 12(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 4(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $12, %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbl %al, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esp,%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%esp,%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esp,%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esp,%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edi, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_16bytes:
+; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $12, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, (%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_16bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%edx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %dl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbl %dl, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%esp,%edi), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%esp,%edi), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%esp,%edi), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%esp,%edi), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_16bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $12, %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbl %dl, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, 28(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%esp,%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %esi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%esp,%edx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%edx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %dl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbl %dl, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%esp,%edi), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%esp,%edi), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%esp,%edi), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%esp,%edi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX-LABEL: shl_16bytes:
+; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $12, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movsbl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, (%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_16bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%edx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %dl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbl %dl, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 24(%esp,%edi), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 28(%esp,%edi), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 16(%esp,%edi), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 20(%esp,%edi), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_16bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $12, %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbl %dl, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, 28(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%esp,%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %esi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 16(%esp,%edx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%edx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %dl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbl %dl, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%esp,%edi), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 28(%esp,%edi), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 16(%esp,%edi), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%esp,%edi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
%bitOff = shl i128 %byteOff, 3
@@ -2833,31 +2453,31 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: andb $12, %bl
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%esi), %ebx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %dl
-; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ebx,%ebx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %ecx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, (%esp,%esi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %ebx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%esi), %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %eax, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %esi, %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%edi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %al
+; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%esi,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, (%esp,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%edi), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %edi, %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%esi)
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 8(%esi)
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%esi)
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 12(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 8(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ebx, (%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%esi)
; X86-NO-SHLD-HAVE-BMI2-NEXT: addl $44, %esp
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi
@@ -3108,1944 +2728,1477 @@ define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
}

define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: lshr_32bytes:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rcx
-; FALLBACK0-NEXT: movq 8(%rdi), %r8
-; FALLBACK0-NEXT: movq 16(%rdi), %r9
-; FALLBACK0-NEXT: movq 24(%rdi), %rdi
-; FALLBACK0-NEXT: movzbl (%rsi), %esi
-; FALLBACK0-NEXT: leal (,%rsi,8), %eax
-; FALLBACK0-NEXT: xorps %xmm0, %xmm0
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: andb $24, %sil
-; FALLBACK0-NEXT: movzbl %sil, %r9d
-; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi
-; FALLBACK0-NEXT: movq %rdi, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx
-; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: orq %r11, %r8
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r10
-; FALLBACK0-NEXT: addq %rdi, %rdi
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %rdi
-; FALLBACK0-NEXT: orq %r10, %rdi
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: orq %rbx, %r10
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r9
-; FALLBACK0-NEXT: movq %r9, 24(%rdx)
-; FALLBACK0-NEXT: movq %r10, 16(%rdx)
-; FALLBACK0-NEXT: movq %rdi, (%rdx)
-; FALLBACK0-NEXT: movq %r8, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: lshr_32bytes:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: movq (%rdi), %rax
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %rdi
-; FALLBACK1-NEXT: movzbl (%rsi), %esi
-; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK1-NEXT: xorps %xmm0, %xmm0
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: andb $24, %sil
-; FALLBACK1-NEXT: movzbl %sil, %eax
-; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi
-; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi
-; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8
-; FALLBACK1-NEXT: movq %r8, %r9
-; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax
-; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK1-NEXT: shrq %cl, %rax
-; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK1-NEXT: movq %rax, 24(%rdx)
-; FALLBACK1-NEXT: movq %rdi, (%rdx)
-; FALLBACK1-NEXT: movq %r9, 8(%rdx)
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: lshr_32bytes:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: movq (%rdi), %rcx
-; FALLBACK2-NEXT: movq 8(%rdi), %r8
-; FALLBACK2-NEXT: movq 16(%rdi), %r9
-; FALLBACK2-NEXT: movq 24(%rdi), %rdi
-; FALLBACK2-NEXT: movzbl (%rsi), %esi
-; FALLBACK2-NEXT: leal (,%rsi,8), %eax
-; FALLBACK2-NEXT: xorps %xmm0, %xmm0
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: andb $24, %sil
-; FALLBACK2-NEXT: movzbl %sil, %ecx
-; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi
-; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi
-; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
-; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9
-; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11
-; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK2-NEXT: notb %al
-; FALLBACK2-NEXT: addq %rdi, %rdi
-; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT: orq %r8, %rdi
-; FALLBACK2-NEXT: addq %rsi, %rsi
-; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r9, %rsi
-; FALLBACK2-NEXT: addq %rcx, %rcx
-; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
-; FALLBACK2-NEXT: orq %r10, %rax
-; FALLBACK2-NEXT: movq %r11, 24(%rdx)
-; FALLBACK2-NEXT: movq %rax, 16(%rdx)
-; FALLBACK2-NEXT: movq %rsi, (%rdx)
-; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
-; FALLBACK2-NEXT: retq
-;
-; FALLBACK3-LABEL: lshr_32bytes:
-; FALLBACK3: # %bb.0:
-; FALLBACK3-NEXT: movq (%rdi), %rax
-; FALLBACK3-NEXT: movq 8(%rdi), %r8
-; FALLBACK3-NEXT: movq 16(%rdi), %r9
-; FALLBACK3-NEXT: movq 24(%rdi), %rdi
-; FALLBACK3-NEXT: movzbl (%rsi), %esi
-; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK3-NEXT: xorps %xmm0, %xmm0
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: andb $24, %sil
-; FALLBACK3-NEXT: movzbl %sil, %eax
-; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi
-; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi
-; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8
-; FALLBACK3-NEXT: movq %r8, %r9
-; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax
-; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax
-; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK3-NEXT: movq %rax, 24(%rdx)
-; FALLBACK3-NEXT: movq %rdi, (%rdx)
-; FALLBACK3-NEXT: movq %r9, 8(%rdx)
-; FALLBACK3-NEXT: retq
-;
-; FALLBACK4-LABEL: lshr_32bytes:
-; FALLBACK4: # %bb.0:
-; FALLBACK4-NEXT: pushq %rbx
-; FALLBACK4-NEXT: movups (%rdi), %xmm0
-; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK4-NEXT: movzbl (%rsi), %ecx
-; FALLBACK4-NEXT: leal (,%rcx,8), %eax
-; FALLBACK4-NEXT: xorps %xmm2, %xmm2
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: andb $24, %cl
-; FALLBACK4-NEXT: movzbl %cl, %r9d
-; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r10
-; FALLBACK4-NEXT: movl %eax, %esi
-; FALLBACK4-NEXT: notb %sil
-; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rdi
-; FALLBACK4-NEXT: orq %r10, %rdi
-; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10
-; FALLBACK4-NEXT: movq %r10, %r11
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r11
-; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rbx
-; FALLBACK4-NEXT: orq %r11, %rbx
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r8
-; FALLBACK4-NEXT: addq %r10, %r10
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r10
-; FALLBACK4-NEXT: orq %r8, %r10
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r9
-; FALLBACK4-NEXT: movq %r9, 24(%rdx)
-; FALLBACK4-NEXT: movq %r10, 8(%rdx)
-; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK4-NEXT: movq %rdi, (%rdx)
-; FALLBACK4-NEXT: popq %rbx
-; FALLBACK4-NEXT: retq
-;
-; FALLBACK5-LABEL: lshr_32bytes:
-; FALLBACK5: # %bb.0:
-; FALLBACK5-NEXT: movups (%rdi), %xmm0
-; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK5-NEXT: movzbl (%rsi), %eax
-; FALLBACK5-NEXT: leal (,%rax,8), %ecx
-; FALLBACK5-NEXT: xorps %xmm2, %xmm2
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: andb $24, %al
-; FALLBACK5-NEXT: movzbl %al, %eax
-; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK5-NEXT: movq %rdi, %r8
-; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK5-NEXT: movq %rax, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK5-NEXT: shrq %cl, %rsi
-; FALLBACK5-NEXT: movq %r10, 8(%rdx)
-; FALLBACK5-NEXT: movq %r8, 16(%rdx)
-; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK5-NEXT: movq %r9, (%rdx)
-; FALLBACK5-NEXT: retq
-;
-; FALLBACK6-LABEL: lshr_32bytes:
-; FALLBACK6: # %bb.0:
-; FALLBACK6-NEXT: movups (%rdi), %xmm0
-; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK6-NEXT: movzbl (%rsi), %ecx
-; FALLBACK6-NEXT: leal (,%rcx,8), %eax
-; FALLBACK6-NEXT: xorps %xmm2, %xmm2
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: andb $24, %cl
-; FALLBACK6-NEXT: movzbl %cl, %ecx
-; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi
-; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8
-; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11
-; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK6-NEXT: notb %al
-; FALLBACK6-NEXT: addq %rdi, %rdi
-; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT: orq %rsi, %rdi
-; FALLBACK6-NEXT: addq %rcx, %rcx
-; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT: orq %r9, %rcx
-; FALLBACK6-NEXT: addq %r8, %r8
-; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK6-NEXT: orq %r10, %rax
-; FALLBACK6-NEXT: movq %r11, 24(%rdx)
-; FALLBACK6-NEXT: movq %rax, 8(%rdx)
-; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT: movq %rdi, (%rdx)
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: lshr_32bytes:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK7-NEXT: movzbl (%rsi), %eax
-; FALLBACK7-NEXT: leal (,%rax,8), %ecx
-; FALLBACK7-NEXT: xorps %xmm2, %xmm2
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: andb $24, %al
-; FALLBACK7-NEXT: movzbl %al, %eax
-; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK7-NEXT: movq %rdi, %r8
-; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK7-NEXT: movq %rax, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax
-; FALLBACK7-NEXT: movq %r10, 8(%rdx)
-; FALLBACK7-NEXT: movq %r8, 16(%rdx)
-; FALLBACK7-NEXT: movq %rax, 24(%rdx)
-; FALLBACK7-NEXT: movq %r9, (%rdx)
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: lshr_32bytes:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: pushq %rbx
-; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK8-NEXT: movzbl (%rsi), %ecx
-; FALLBACK8-NEXT: leal (,%rcx,8), %eax
-; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: andb $24, %cl
-; FALLBACK8-NEXT: movzbl %cl, %r9d
-; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rdi
-; FALLBACK8-NEXT: orq %r10, %rdi
-; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10
-; FALLBACK8-NEXT: movq %r10, %r11
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r11
-; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rbx
-; FALLBACK8-NEXT: orq %r11, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r8
-; FALLBACK8-NEXT: addq %r10, %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: orq %r8, %r10
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r9
-; FALLBACK8-NEXT: movq %r9, 24(%rdx)
-; FALLBACK8-NEXT: movq %r10, 8(%rdx)
-; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK8-NEXT: movq %rdi, (%rdx)
-; FALLBACK8-NEXT: popq %rbx
-; FALLBACK8-NEXT: vzeroupper
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: lshr_32bytes:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK9-NEXT: movzbl (%rsi), %eax
-; FALLBACK9-NEXT: leal (,%rax,8), %ecx
-; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: andb $24, %al
-; FALLBACK9-NEXT: movzbl %al, %eax
-; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK9-NEXT: movq %rdi, %r8
-; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK9-NEXT: movq %rax, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK9-NEXT: shrq %cl, %rsi
-; FALLBACK9-NEXT: movq %r10, 8(%rdx)
-; FALLBACK9-NEXT: movq %r8, 16(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK9-NEXT: movq %r9, (%rdx)
-; FALLBACK9-NEXT: vzeroupper
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: lshr_32bytes:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT: movzbl (%rsi), %ecx
-; FALLBACK10-NEXT: leal (,%rcx,8), %eax
-; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: andb $24, %cl
-; FALLBACK10-NEXT: movzbl %cl, %ecx
-; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi
-; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8
-; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11
-; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK10-NEXT: notb %al
-; FALLBACK10-NEXT: addq %rdi, %rdi
-; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT: orq %rsi, %rdi
-; FALLBACK10-NEXT: addq %rcx, %rcx
-; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT: orq %r9, %rcx
-; FALLBACK10-NEXT: addq %r8, %r8
-; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK10-NEXT: orq %r10, %rax
-; FALLBACK10-NEXT: movq %r11, 24(%rdx)
-; FALLBACK10-NEXT: movq %rax, 8(%rdx)
-; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT: movq %rdi, (%rdx)
-; FALLBACK10-NEXT: vzeroupper
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: lshr_32bytes:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK11-NEXT: movzbl (%rsi), %eax
-; FALLBACK11-NEXT: leal (,%rax,8), %ecx
-; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: andb $24, %al
-; FALLBACK11-NEXT: movzbl %al, %eax
-; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK11-NEXT: movq
-56(%rsp,%rax), %rdi -; FALLBACK11-NEXT: movq %rdi, %r8 -; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK11-NEXT: movq %rax, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax -; FALLBACK11-NEXT: movq %r10, 8(%rdx) -; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rax, 24(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) -; FALLBACK11-NEXT: vzeroupper -; FALLBACK11-NEXT: retq -; -; FALLBACK12-LABEL: lshr_32bytes: -; FALLBACK12: # %bb.0: -; FALLBACK12-NEXT: pushq %rbx -; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK12-NEXT: movzbl (%rsi), %ecx -; FALLBACK12-NEXT: leal (,%rcx,8), %eax -; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: andb $24, %cl -; FALLBACK12-NEXT: movzbl %cl, %r9d -; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: movl %eax, %esi -; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %r11 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r11, %rbx -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: addq %r10, %r10 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r8, %r10 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, 24(%rdx) -; FALLBACK12-NEXT: movq %r10, 8(%rdx) -; FALLBACK12-NEXT: movq %rbx, 16(%rdx) -; FALLBACK12-NEXT: movq %rdi, (%rdx) -; FALLBACK12-NEXT: popq %rbx -; FALLBACK12-NEXT: vzeroupper -; FALLBACK12-NEXT: retq -; -; FALLBACK13-LABEL: lshr_32bytes: -; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK13-NEXT: movzbl (%rsi), %eax -; FALLBACK13-NEXT: leal (,%rax,8), %ecx -; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: andb $24, %al -; FALLBACK13-NEXT: movzbl %al, %eax -; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK13-NEXT: movq %rdi, %r8 -; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK13-NEXT: movq %rax, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK13-NEXT: shrq %cl, %rsi -; FALLBACK13-NEXT: movq %r10, 8(%rdx) -; FALLBACK13-NEXT: movq %r8, 16(%rdx) -; FALLBACK13-NEXT: movq %rsi, 24(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) -; FALLBACK13-NEXT: vzeroupper -; FALLBACK13-NEXT: retq -; -; FALLBACK14-LABEL: lshr_32bytes: -; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: leal (,%rcx,8), %eax -; FALLBACK14-NEXT: vxorps %xmm1, 
%xmm1, %xmm1 -; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: movzbl %cl, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) -; FALLBACK14-NEXT: vzeroupper -; FALLBACK14-NEXT: retq -; -; FALLBACK15-LABEL: lshr_32bytes: -; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK15-NEXT: movzbl (%rsi), %eax -; FALLBACK15-NEXT: leal (,%rax,8), %ecx -; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: andb $24, %al -; FALLBACK15-NEXT: movzbl %al, %eax -; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi -; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi -; FALLBACK15-NEXT: movq %rdi, %r8 -; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax -; FALLBACK15-NEXT: movq %r10, 8(%rdx) -; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rax, 24(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) -; FALLBACK15-NEXT: vzeroupper -; FALLBACK15-NEXT: retq -; -; FALLBACK16-LABEL: lshr_32bytes: -; FALLBACK16: # %bb.0: -; FALLBACK16-NEXT: pushl %ebp -; FALLBACK16-NEXT: pushl %ebx -; FALLBACK16-NEXT: pushl %edi -; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $108, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK16-NEXT: movl (%ebp), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%ebp), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%ebp), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%ebp), %edi -; FALLBACK16-NEXT: movl 16(%ebp), %ebx -; FALLBACK16-NEXT: movb (%eax), %ah -; FALLBACK16-NEXT: movl 20(%ebp), %esi -; FALLBACK16-NEXT: movl 24(%ebp), %ecx -; FALLBACK16-NEXT: movl 28(%ebp), %ebp -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movb %ah, %dh -; FALLBACK16-NEXT: shlb $3, %dh -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: andb $28, %ah -; FALLBACK16-NEXT: movzbl %ah, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi -; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax -; FALLBACK16-NEXT: movl %eax, %ebx -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movb %dh, %dl -; FALLBACK16-NEXT: notb %dl -; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %ebp -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %ebx, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: movl %eax, %ebx -; FALLBACK16-NEXT: addl %eax, %ebx -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %esi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp -; FALLBACK16-NEXT: movl %ebp, %esi -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %esi -; FALLBACK16-NEXT: movl 48(%esp,%eax), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%eax,%eax), %ebx -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %esi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %ebp, %ebp -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %edi, %ebp -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movl 52(%esp,%eax), %edi -; FALLBACK16-NEXT: movl %edi, %ebx -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl 56(%esp,%eax), %esi -; FALLBACK16-NEXT: leal (%esi,%esi), %eax -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebx, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: addl %edi, %edi -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %ebx, %edi -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: movl %esi, %eax -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl 60(%esp,%ecx), %ebx -; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi -; FALLBACK16-NEXT: movl %edx, %ecx -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %eax, %esi -; FALLBACK16-NEXT: movb %dh, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl %ebx, 28(%eax) -; FALLBACK16-NEXT: movl %esi, 24(%eax) -; FALLBACK16-NEXT: movl %edi, 16(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; 
FALLBACK16-NEXT: movl %ecx, 20(%eax) -; FALLBACK16-NEXT: movl %ebp, 8(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 12(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, (%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 4(%eax) -; FALLBACK16-NEXT: addl $108, %esp -; FALLBACK16-NEXT: popl %esi -; FALLBACK16-NEXT: popl %edi -; FALLBACK16-NEXT: popl %ebx -; FALLBACK16-NEXT: popl %ebp -; FALLBACK16-NEXT: retl -; -; FALLBACK17-LABEL: lshr_32bytes: -; FALLBACK17: # %bb.0: -; FALLBACK17-NEXT: pushl %ebp -; FALLBACK17-NEXT: pushl %ebx -; FALLBACK17-NEXT: pushl %edi -; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $92, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl (%ebp), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 4(%ebp), %eax -; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%ebp), %esi -; FALLBACK17-NEXT: movl 12(%ebp), %edi -; FALLBACK17-NEXT: movl 16(%ebp), %ebx -; FALLBACK17-NEXT: movb (%ecx), %ch -; FALLBACK17-NEXT: movl 20(%ebp), %edx -; FALLBACK17-NEXT: movl 24(%ebp), %eax -; FALLBACK17-NEXT: movl 28(%ebp), %ebp -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movb %ch, %cl -; FALLBACK17-NEXT: shlb $3, %cl -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: andb $28, %ch -; FALLBACK17-NEXT: movzbl %ch, %ebp -; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %edx, %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx -; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %esi -; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %edx, %edi -; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 24(%ebp) -; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload -; FALLBACK17-NEXT: shrdl %cl, %edx, %esi -; FALLBACK17-NEXT: shrl %cl, %eax -; FALLBACK17-NEXT: movl %eax, 28(%ebp) -; FALLBACK17-NEXT: movl %ebx, 16(%ebp) -; FALLBACK17-NEXT: movl %edi, 20(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: 
movl %eax, 8(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %esi, (%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 4(%ebp) -; FALLBACK17-NEXT: addl $92, %esp -; FALLBACK17-NEXT: popl %esi -; FALLBACK17-NEXT: popl %edi -; FALLBACK17-NEXT: popl %ebx -; FALLBACK17-NEXT: popl %ebp -; FALLBACK17-NEXT: retl -; -; FALLBACK18-LABEL: lshr_32bytes: -; FALLBACK18: # %bb.0: -; FALLBACK18-NEXT: pushl %ebp -; FALLBACK18-NEXT: pushl %ebx -; FALLBACK18-NEXT: pushl %edi -; FALLBACK18-NEXT: pushl %esi -; FALLBACK18-NEXT: subl $108, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%eax), %esi -; FALLBACK18-NEXT: movl 12(%eax), %edi -; FALLBACK18-NEXT: movl 16(%eax), %ebp -; FALLBACK18-NEXT: movzbl (%ebx), %ebx -; FALLBACK18-NEXT: movl 20(%eax), %edx -; FALLBACK18-NEXT: movl 24(%eax), %ecx -; FALLBACK18-NEXT: movl 28(%eax), %eax -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, %eax -; FALLBACK18-NEXT: shlb $3, %al -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $28, %bl -; FALLBACK18-NEXT: movzbl %bl, %edi -; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %esi, %edx -; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %ebx, %ecx -; FALLBACK18-NEXT: shrxl %ebx, 32(%esp,%edi), %ebx -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi -; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: shrxl %ecx, %ebp, %ebx -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: addl %ebp, 
%ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %ecx, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %esi, %ecx -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %esi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax -; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebx -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %ebx, 28(%eax) -; FALLBACK18-NEXT: movl %edi, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 16(%eax) -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) -; FALLBACK18-NEXT: addl $108, %esp -; FALLBACK18-NEXT: popl %esi -; FALLBACK18-NEXT: popl %edi -; FALLBACK18-NEXT: popl %ebx -; FALLBACK18-NEXT: popl %ebp -; FALLBACK18-NEXT: retl -; -; FALLBACK19-LABEL: lshr_32bytes: -; FALLBACK19: # %bb.0: -; FALLBACK19-NEXT: pushl %ebp -; FALLBACK19-NEXT: pushl %ebx -; FALLBACK19-NEXT: pushl %edi -; FALLBACK19-NEXT: pushl %esi -; FALLBACK19-NEXT: subl $92, %esp -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK19-NEXT: movl (%ecx), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 4(%ecx), %eax -; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 8(%ecx), %esi -; FALLBACK19-NEXT: movl 12(%ecx), %edi -; FALLBACK19-NEXT: movl 16(%ecx), %ebp -; FALLBACK19-NEXT: movzbl (%ebx), %ebx -; FALLBACK19-NEXT: movl 20(%ecx), %edx -; FALLBACK19-NEXT: movl 24(%ecx), %eax -; FALLBACK19-NEXT: movl 28(%ecx), %ecx -; FALLBACK19-NEXT: xorps %xmm0, %xmm0 -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebx, %ecx -; FALLBACK19-NEXT: shlb $3, %cl -; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK19-NEXT: andb $28, %bl -; FALLBACK19-NEXT: movzbl %bl, %ebp -; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi -; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
FALLBACK19-NEXT: shrdl %cl, %esi, %eax -; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx -; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl %eax, %edx -; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: shrdl %cl, %eax, %esi -; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax -; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl %edx, %esi -; FALLBACK19-NEXT: shrdl %cl, %eax, %esi -; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx -; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi -; FALLBACK19-NEXT: shrdl %cl, %edi, %eax -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK19-NEXT: movl %eax, 24(%ebp) -; FALLBACK19-NEXT: shrxl %ecx, %edi, %eax -; FALLBACK19-NEXT: movl %eax, 28(%ebp) -; FALLBACK19-NEXT: movl %ebx, 16(%ebp) -; FALLBACK19-NEXT: movl %esi, 20(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 8(%ebp) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 12(%ebp) -; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: shrdl %cl, %eax, %edx -; FALLBACK19-NEXT: movl %edx, (%ebp) -; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK19-NEXT: movl %eax, 4(%ebp) -; FALLBACK19-NEXT: addl $92, %esp -; FALLBACK19-NEXT: popl %esi -; FALLBACK19-NEXT: popl %edi -; FALLBACK19-NEXT: popl %ebx -; FALLBACK19-NEXT: popl %ebp -; FALLBACK19-NEXT: retl -; -; FALLBACK20-LABEL: lshr_32bytes: -; FALLBACK20: # %bb.0: -; FALLBACK20-NEXT: pushl %ebp -; FALLBACK20-NEXT: pushl %ebx -; FALLBACK20-NEXT: pushl %edi -; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $108, %esp -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: movups (%ecx), %xmm0 -; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK20-NEXT: movzbl (%eax), %ecx -; FALLBACK20-NEXT: movl %ecx, %eax -; FALLBACK20-NEXT: shlb $3, %al -; FALLBACK20-NEXT: xorps %xmm2, %xmm2 -; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: andb $28, %cl -; FALLBACK20-NEXT: movzbl %cl, %edi -; FALLBACK20-NEXT: movl 32(%esp,%edi), %esi -; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: movl %eax, %edx -; FALLBACK20-NEXT: notb %dl -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %esi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %esi -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %esi -; FALLBACK20-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: 
orl %esi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK20-NEXT: movl %esi, %ebx -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: addl %ebp, %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %ebx, %ebp -; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %ebx -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK20-NEXT: movl %ecx, (%esp) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %edi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl %ebx, %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebp, %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %edi, %ebp -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl 60(%esp,%ecx), %ebx -; FALLBACK20-NEXT: leal (%ebx,%ebx), %edi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl (%esp), %edi # 4-byte Folded Reload -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl %ebx, 28(%eax) -; FALLBACK20-NEXT: movl %esi, 4(%eax) -; FALLBACK20-NEXT: movl %edi, 24(%eax) -; FALLBACK20-NEXT: movl %ebp, 16(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 20(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 8(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 12(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, (%eax) -; FALLBACK20-NEXT: addl $108, %esp -; FALLBACK20-NEXT: popl %esi -; FALLBACK20-NEXT: popl %edi -; FALLBACK20-NEXT: popl %ebx -; FALLBACK20-NEXT: popl %ebp -; FALLBACK20-NEXT: retl -; -; FALLBACK21-LABEL: lshr_32bytes: -; FALLBACK21: # %bb.0: -; FALLBACK21-NEXT: pushl %ebp -; FALLBACK21-NEXT: pushl %ebx -; FALLBACK21-NEXT: pushl %edi -; FALLBACK21-NEXT: pushl %esi -; FALLBACK21-NEXT: subl $108, %esp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movups (%ecx), %xmm0 -; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK21-NEXT: movzbl (%eax), %eax -; FALLBACK21-NEXT: movl %eax, %ecx -; FALLBACK21-NEXT: shlb $3, %cl -; FALLBACK21-NEXT: xorps %xmm2, %xmm2 -; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; 
FALLBACK21-NEXT: andb $28, %al -; FALLBACK21-NEXT: movzbl %al, %ebp -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %esi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl %edi, %esi -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %esi, 4(%ebp) -; FALLBACK21-NEXT: movl %ebx, 24(%ebp) -; FALLBACK21-NEXT: shrdl %cl, %edi, %edx -; FALLBACK21-NEXT: shrl %cl, %eax -; FALLBACK21-NEXT: movl %eax, 28(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %edx, (%ebp) -; FALLBACK21-NEXT: addl $108, %esp -; FALLBACK21-NEXT: popl %esi -; FALLBACK21-NEXT: popl %edi -; FALLBACK21-NEXT: popl %ebx -; FALLBACK21-NEXT: popl %ebp -; FALLBACK21-NEXT: retl -; -; FALLBACK22-LABEL: lshr_32bytes: -; FALLBACK22: # %bb.0: -; FALLBACK22-NEXT: pushl %ebp -; FALLBACK22-NEXT: pushl %ebx -; FALLBACK22-NEXT: pushl %edi -; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $108, %esp -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %edx -; FALLBACK22-NEXT: shlb $3, %dl -; FALLBACK22-NEXT: xorps %xmm2, %xmm2 -; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %edx, %eax -; FALLBACK22-NEXT: notb %al -; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %eax, 
%ecx, %esi -; FALLBACK22-NEXT: movl %eax, %ebp -; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK22-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK22-NEXT: movl 40(%esp,%edi), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebx -; FALLBACK22-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK22-NEXT: movl %ebp, %ecx -; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %ebx, %ebx -; FALLBACK22-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK22-NEXT: orl %ebp, %ebx -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: movl %ecx, %edx -; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK22-NEXT: orl %ebp, %edi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %eax, 28(%edx) -; FALLBACK22-NEXT: movl %ecx, 4(%edx) -; FALLBACK22-NEXT: movl %edi, 24(%edx) -; FALLBACK22-NEXT: movl %ebx, 16(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 8(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 12(%edx) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, (%edx) -; FALLBACK22-NEXT: addl $108, %esp -; FALLBACK22-NEXT: popl %esi -; FALLBACK22-NEXT: popl %edi -; FALLBACK22-NEXT: popl %ebx -; FALLBACK22-NEXT: popl %ebp -; FALLBACK22-NEXT: retl -; -; FALLBACK23-LABEL: lshr_32bytes: -; FALLBACK23: # %bb.0: -; FALLBACK23-NEXT: pushl %ebp -; FALLBACK23-NEXT: pushl %ebx -; FALLBACK23-NEXT: pushl %edi -; FALLBACK23-NEXT: pushl %esi -; FALLBACK23-NEXT: subl $108, %esp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movups (%ecx), %xmm0 -; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK23-NEXT: movzbl (%eax), %eax -; FALLBACK23-NEXT: movl %eax, %ecx -; FALLBACK23-NEXT: shlb $3, %cl -; FALLBACK23-NEXT: xorps %xmm2, %xmm2 -; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: andb $28, %al -; FALLBACK23-NEXT: movzbl %al, %ebx -; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, %edi -; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi -; FALLBACK23-NEXT: shrdl %cl, %eax, %esi -; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp -; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl %ebx, 4(%eax) -; FALLBACK23-NEXT: movl %ebp, 24(%eax) -; FALLBACK23-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK23-NEXT: movl %ebx, 28(%eax) -; FALLBACK23-NEXT: movl %esi, 16(%eax) -; FALLBACK23-NEXT: movl %edi, 20(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: movl %esi, 8(%eax) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: movl %esi, 12(%eax) -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, (%eax) -; FALLBACK23-NEXT: addl $108, %esp -; FALLBACK23-NEXT: popl %esi -; FALLBACK23-NEXT: popl %edi -; FALLBACK23-NEXT: popl %ebx -; FALLBACK23-NEXT: popl %ebp -; FALLBACK23-NEXT: retl -; -; FALLBACK24-LABEL: lshr_32bytes: -; FALLBACK24: # %bb.0: -; FALLBACK24-NEXT: pushl %ebp -; FALLBACK24-NEXT: pushl %ebx -; FALLBACK24-NEXT: pushl %edi -; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $108, %esp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK24-NEXT: movzbl (%eax), %ecx -; FALLBACK24-NEXT: movl %ecx, %eax -; FALLBACK24-NEXT: shlb $3, %al -; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: andb $28, %cl -; FALLBACK24-NEXT: movzbl %cl, %edi -; FALLBACK24-NEXT: movl 32(%esp,%edi), %esi -; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %esi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %esi -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %esi, %ebx 
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK24-NEXT: movl %esi, %ebx -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %ebp, %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %ebx, %ebp -; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %ebx -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK24-NEXT: movl %ecx, (%esp) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %edi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl %ebx, %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebp, %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %edi, %ebp -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl 60(%esp,%ecx), %ebx -; FALLBACK24-NEXT: leal (%ebx,%ebx), %edi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl (%esp), %edi # 4-byte Folded Reload -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl %ebx, 28(%eax) -; FALLBACK24-NEXT: movl %esi, 4(%eax) -; FALLBACK24-NEXT: movl %edi, 24(%eax) -; FALLBACK24-NEXT: movl %ebp, 16(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 20(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 8(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 12(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, (%eax) -; FALLBACK24-NEXT: addl $108, %esp -; FALLBACK24-NEXT: popl %esi -; FALLBACK24-NEXT: popl %edi -; FALLBACK24-NEXT: popl %ebx -; FALLBACK24-NEXT: popl %ebp -; FALLBACK24-NEXT: vzeroupper -; FALLBACK24-NEXT: retl -; -; FALLBACK25-LABEL: lshr_32bytes: -; FALLBACK25: # %bb.0: -; FALLBACK25-NEXT: pushl %ebp -; FALLBACK25-NEXT: pushl %ebx -; FALLBACK25-NEXT: pushl %edi -; FALLBACK25-NEXT: pushl %esi -; FALLBACK25-NEXT: subl $108, %esp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK25-NEXT: movzbl (%eax), %eax -; FALLBACK25-NEXT: movl %eax, %ecx -; FALLBACK25-NEXT: shlb $3, %cl -; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: andb $28, %al -; FALLBACK25-NEXT: movzbl %al, %ebp -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi -; 
FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl %edi, %esi -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %esi, 4(%ebp) -; FALLBACK25-NEXT: movl %ebx, 24(%ebp) -; FALLBACK25-NEXT: shrdl %cl, %edi, %edx -; FALLBACK25-NEXT: shrl %cl, %eax -; FALLBACK25-NEXT: movl %eax, 28(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %edx, (%ebp) -; FALLBACK25-NEXT: addl $108, %esp -; FALLBACK25-NEXT: popl %esi -; FALLBACK25-NEXT: popl %edi -; FALLBACK25-NEXT: popl %ebx -; FALLBACK25-NEXT: popl %ebp -; FALLBACK25-NEXT: vzeroupper -; FALLBACK25-NEXT: retl -; -; FALLBACK26-LABEL: lshr_32bytes: -; FALLBACK26: # %bb.0: -; FALLBACK26-NEXT: pushl %ebp -; FALLBACK26-NEXT: pushl %ebx -; FALLBACK26-NEXT: pushl %edi -; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $108, %esp -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %edx -; FALLBACK26-NEXT: shlb $3, %dl -; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %edx, %eax -; FALLBACK26-NEXT: notb %al -; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK26-NEXT: movl %eax, %ebp -; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK26-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: addl %ecx, %ecx
-; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi
-; FALLBACK26-NEXT: movl 40(%esp,%edi), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, %eax, %ebx
-; FALLBACK26-NEXT: orl %ebx, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi
-; FALLBACK26-NEXT: leal (%esi,%esi), %ebx
-; FALLBACK26-NEXT: shlxl %ebp, %ebx, %eax
-; FALLBACK26-NEXT: movl %ebp, %ecx
-; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx
-; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK26-NEXT: addl %ebx, %ebx
-; FALLBACK26-NEXT: shlxl %ecx, %ebx, %ebx
-; FALLBACK26-NEXT: orl %ebp, %ebx
-; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %eax
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: movl %ecx, %edx
-; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi
-; FALLBACK26-NEXT: orl %ebp, %edi
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: addl %ecx, %ecx
-; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK26-NEXT: orl %esi, %ecx
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK26-NEXT: movl %eax, 28(%edx)
-; FALLBACK26-NEXT: movl %ecx, 4(%edx)
-; FALLBACK26-NEXT: movl %edi, 24(%edx)
-; FALLBACK26-NEXT: movl %ebx, 16(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 20(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 8(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 12(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, (%edx)
-; FALLBACK26-NEXT: addl $108, %esp
-; FALLBACK26-NEXT: popl %esi
-; FALLBACK26-NEXT: popl %edi
-; FALLBACK26-NEXT: popl %ebx
-; FALLBACK26-NEXT: popl %ebp
-; FALLBACK26-NEXT: vzeroupper
-; FALLBACK26-NEXT: retl
-;
-; FALLBACK27-LABEL: lshr_32bytes:
-; FALLBACK27: # %bb.0:
-; FALLBACK27-NEXT: pushl %ebp
-; FALLBACK27-NEXT: pushl %ebx
-; FALLBACK27-NEXT: pushl %edi
-; FALLBACK27-NEXT: pushl %esi
-; FALLBACK27-NEXT: subl $108, %esp
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK27-NEXT: movzbl (%eax), %eax
-; FALLBACK27-NEXT: movl %eax, %ecx
-; FALLBACK27-NEXT: shlb $3, %cl
-; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: andb $28, %al
-; FALLBACK27-NEXT: movzbl %al, %ebx
-; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi
-; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp
-; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl %eax, %edi
-; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi
-; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp
-; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx
-; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx
-; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl %ebx, 4(%eax)
-; FALLBACK27-NEXT: movl %ebp, 24(%eax)
-; FALLBACK27-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; FALLBACK27-NEXT: movl %ebx, 28(%eax)
-; FALLBACK27-NEXT: movl %esi, 16(%eax)
-; FALLBACK27-NEXT: movl %edi, 20(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK27-NEXT: movl %esi, 8(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK27-NEXT: movl %esi, 12(%eax)
-; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, (%eax)
-; FALLBACK27-NEXT: addl $108, %esp
-; FALLBACK27-NEXT: popl %esi
-; FALLBACK27-NEXT: popl %edi
-; FALLBACK27-NEXT: popl %ebx
-; FALLBACK27-NEXT: popl %ebp
-; FALLBACK27-NEXT: vzeroupper
-; FALLBACK27-NEXT: retl
-;
-; FALLBACK28-LABEL: lshr_32bytes:
-; FALLBACK28: # %bb.0:
-; FALLBACK28-NEXT: pushl %ebp
-; FALLBACK28-NEXT: pushl %ebx
-; FALLBACK28-NEXT: pushl %edi
-; FALLBACK28-NEXT: pushl %esi
-; FALLBACK28-NEXT: subl $108, %esp
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK28-NEXT: movzbl (%eax), %ecx
-; FALLBACK28-NEXT: movl %ecx, %eax
-; FALLBACK28-NEXT: shlb $3, %al
-; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: andb $28, %cl
-; FALLBACK28-NEXT: movzbl %cl, %edi
-; FALLBACK28-NEXT: movl 32(%esp,%edi), %esi
-; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %esi
-; FALLBACK28-NEXT: movl %eax, %edx
-; FALLBACK28-NEXT: notb %dl
-; FALLBACK28-NEXT: addl %ebx, %ebx
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %esi, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebp
-; FALLBACK28-NEXT: movl %ebp, %esi
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %esi
-; FALLBACK28-NEXT: movl 48(%esp,%edi), %ecx
-; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %esi, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi
-; FALLBACK28-NEXT: movl %esi, %ebx
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: addl %ebp, %ebp
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebp
-; FALLBACK28-NEXT: orl %ebx, %ebp
-; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp
-; FALLBACK28-NEXT: movl %ebp, %ebx
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx
-; FALLBACK28-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; FALLBACK28-NEXT: leal (%ecx,%ecx), %edi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %edi
-; FALLBACK28-NEXT: orl %ebx, %edi
-; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: addl %ebp, %ebp
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebp
-; FALLBACK28-NEXT: orl %edi, %ebp
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl 60(%esp,%ecx), %ebx
-; FALLBACK28-NEXT: leal (%ebx,%ebx), %edi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %edi
-; FALLBACK28-NEXT: orl (%esp), %edi # 4-byte Folded Reload
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; FALLBACK28-NEXT: addl %esi, %esi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl %ebx, 28(%eax)
-; FALLBACK28-NEXT: movl %esi, 4(%eax)
-; FALLBACK28-NEXT: movl %edi, 24(%eax)
-; FALLBACK28-NEXT: movl %ebp, 16(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 20(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 8(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 12(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, (%eax)
-; FALLBACK28-NEXT: addl $108, %esp
-; FALLBACK28-NEXT: popl %esi
-; FALLBACK28-NEXT: popl %edi
-; FALLBACK28-NEXT: popl %ebx
-; FALLBACK28-NEXT: popl %ebp
-; FALLBACK28-NEXT: vzeroupper
-; FALLBACK28-NEXT: retl
-;
-; FALLBACK29-LABEL: lshr_32bytes:
-; FALLBACK29: # %bb.0:
-; FALLBACK29-NEXT: pushl %ebp
-; FALLBACK29-NEXT: pushl %ebx
-; FALLBACK29-NEXT: pushl %edi
-; FALLBACK29-NEXT: pushl %esi
-; FALLBACK29-NEXT: subl $108, %esp
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK29-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK29-NEXT: movzbl (%eax), %eax
-; FALLBACK29-NEXT: movl %eax, %ecx
-; FALLBACK29-NEXT: shlb $3, %cl
-; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: andb $28, %al
-; FALLBACK29-NEXT: movzbl %al, %ebp
-; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi
-; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx
-; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi
-; FALLBACK29-NEXT: movl %edi, %esi
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK29-NEXT: movl %esi, 4(%ebp)
-; FALLBACK29-NEXT: movl %ebx, 24(%ebp)
-; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK29-NEXT: shrl %cl, %eax
-; FALLBACK29-NEXT: movl %eax, 28(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 16(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 20(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 8(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 12(%ebp)
-; FALLBACK29-NEXT: movl %edx, (%ebp)
-; FALLBACK29-NEXT: addl $108, %esp
-; FALLBACK29-NEXT: popl %esi
-; FALLBACK29-NEXT: popl %edi
-; FALLBACK29-NEXT: popl %ebx
-; FALLBACK29-NEXT: popl %ebp
-; FALLBACK29-NEXT: vzeroupper
-; FALLBACK29-NEXT: retl
-;
-; FALLBACK30-LABEL: lshr_32bytes:
-; FALLBACK30: # %bb.0:
-; FALLBACK30-NEXT: pushl %ebp
-; FALLBACK30-NEXT: pushl %ebx
-; FALLBACK30-NEXT: pushl %edi
-; FALLBACK30-NEXT: pushl %esi
-; FALLBACK30-NEXT: subl $108, %esp
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK30-NEXT: movzbl (%eax), %ecx
-; FALLBACK30-NEXT: movl %ecx, %edx
-; FALLBACK30-NEXT: shlb $3, %dl
-; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: andb $28, %cl
-; FALLBACK30-NEXT: movzbl %cl, %edi
-; FALLBACK30-NEXT: shrxl %edx, 32(%esp,%edi), %ecx
-; FALLBACK30-NEXT: movl %edx, %eax
-; FALLBACK30-NEXT: notb %al
-; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: addl %esi, %esi
-; FALLBACK30-NEXT: shlxl %eax, %esi, %esi
-; FALLBACK30-NEXT: orl %ecx, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx
-; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: addl %ecx, %ecx
-; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi
-; FALLBACK30-NEXT: movl %eax, %ebp
-; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx
-; FALLBACK30-NEXT: shrxl %edx, %ecx, %ebx
-; FALLBACK30-NEXT: orl %ebx, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: addl %ecx, %ecx
-; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi
-; FALLBACK30-NEXT: movl 40(%esp,%edi), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, %eax, %ebx
-; FALLBACK30-NEXT: orl %ebx, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi
-; FALLBACK30-NEXT: leal (%esi,%esi), %ebx
-; FALLBACK30-NEXT: shlxl %ebp, %ebx, %eax
-; FALLBACK30-NEXT: movl %ebp, %ecx
-; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx
-; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK30-NEXT: addl %ebx, %ebx
-; FALLBACK30-NEXT: shlxl %ecx, %ebx, %ebx
-; FALLBACK30-NEXT: orl %ebp, %ebx
-; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi
-; FALLBACK30-NEXT: shrxl %edx, %edi, %eax
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: movl %ecx, %edx
-; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi
-; FALLBACK30-NEXT: orl %ebp, %edi
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: addl %ecx, %ecx
-; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK30-NEXT: orl %esi, %ecx
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK30-NEXT: movl %eax, 28(%edx)
-; FALLBACK30-NEXT: movl %ecx, 4(%edx)
-; FALLBACK30-NEXT: movl %edi, 24(%edx)
-; FALLBACK30-NEXT: movl %ebx, 16(%edx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 20(%edx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 8(%edx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 12(%edx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, (%edx)
-; FALLBACK30-NEXT: addl $108, %esp
-; FALLBACK30-NEXT: popl %esi
-; FALLBACK30-NEXT: popl %edi
-; FALLBACK30-NEXT: popl %ebx
-; FALLBACK30-NEXT: popl %ebp
-; FALLBACK30-NEXT: vzeroupper
-; FALLBACK30-NEXT: retl
-;
-; FALLBACK31-LABEL: lshr_32bytes:
-; FALLBACK31: # %bb.0:
-; FALLBACK31-NEXT: pushl %ebp
-; FALLBACK31-NEXT: pushl %ebx
-; FALLBACK31-NEXT: pushl %edi
-; FALLBACK31-NEXT: pushl %esi
-; FALLBACK31-NEXT: subl $108, %esp
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK31-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK31-NEXT: movzbl (%eax), %eax
-; FALLBACK31-NEXT: movl %eax, %ecx
-; FALLBACK31-NEXT: shlb $3, %cl
-; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: andb $28, %al
-; FALLBACK31-NEXT: movzbl %al, %ebx
-; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi
-; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp
-; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax
-; FALLBACK31-NEXT: movl %eax, %edi
-; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi
-; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax
-; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp
-; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx
-; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx
-; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: movl %ebx, 4(%eax)
-; FALLBACK31-NEXT: movl %ebp, 24(%eax)
-; FALLBACK31-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; FALLBACK31-NEXT: movl %ebx, 28(%eax)
-; FALLBACK31-NEXT: movl %esi, 16(%eax)
-; FALLBACK31-NEXT: movl %edi, 20(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK31-NEXT: movl %esi, 8(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK31-NEXT: movl %esi, 12(%eax)
-; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl %edx, (%eax)
-; FALLBACK31-NEXT: addl $108, %esp
-; FALLBACK31-NEXT: popl %esi
-; FALLBACK31-NEXT: popl %edi
-; FALLBACK31-NEXT: popl %ebx
-; FALLBACK31-NEXT: popl %ebp
-; FALLBACK31-NEXT: vzeroupper
-; FALLBACK31-NEXT: retl
+; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes:
+; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %cl
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %al
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %cl
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %cl, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -72(%rsp,%rcx), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rcx), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rcx), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rcx), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rcx,%rcx), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rcx, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %al
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes:
+; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %cl
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %r9d
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %al
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb (%eax), %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ebp), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ah, %dh
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %dh
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %ah, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%edi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%eax), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%eax), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%esi,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb (%ecx), %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ebp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %ch, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ebx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %bl, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%ebp,%ebp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, 32(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 28(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 20(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ecx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %bl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %bl, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes:
+; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl (%esp), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %al
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 24(%ebp)
+;
X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %dl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %dl, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%esp,%ebx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edi, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%esp,%ebx), %edx +; 
X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edi, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edi, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; 
X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %al +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 32(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%esp,%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes: +; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; 
X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %al +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $28, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%edi), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%edi), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebp 
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl (%esp), %edi # 4-byte Folded Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 4(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 24(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 16(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $28, %al +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, 
%eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebp, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %dl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %dl, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 36(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 44(%esp,%ebx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edi, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 40(%esp,%ebx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edi, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edi, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp +; 
X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %al +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 44(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 40(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 32(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 36(%esp,%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl 
killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl
 %src = load i256, ptr %src.ptr, align 1
 %byteOff = load i256, ptr %byteOff.ptr, align 1
 %bitOff = shl i256 %byteOff, 3
@@ -5055,591 +4208,452 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 }

 define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rcx
-; FALLBACK0-NEXT: movq 8(%rdi), %r8
-; FALLBACK0-NEXT: movq 16(%rdi), %r9
-; FALLBACK0-NEXT: movq 24(%rdi), %rdi
-; FALLBACK0-NEXT: movzbl (%rsi), %esi
-; FALLBACK0-NEXT: movl %esi, %eax
-; FALLBACK0-NEXT: shlb $5, %al
-; FALLBACK0-NEXT: xorps %xmm0, %xmm0
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: andb $6, %sil
-; FALLBACK0-NEXT: movzbl %sil, %r9d
-; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10
-; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi
-; FALLBACK0-NEXT: movq %rdi, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx
-; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: orq %r11, %r8
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r10
-; FALLBACK0-NEXT: addq %rdi, %rdi
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %rdi
-; FALLBACK0-NEXT: orq %r10, %rdi
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: orq %rbx, %r10
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r9
-; FALLBACK0-NEXT: movq %r9, 24(%rdx)
-; FALLBACK0-NEXT: movq %r10, 16(%rdx)
-; FALLBACK0-NEXT: movq %rdi, (%rdx)
-; FALLBACK0-NEXT: movq %r8, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: movq (%rdi), %rax
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %rdi
-; FALLBACK1-NEXT: movzbl (%rsi), %esi
-; FALLBACK1-NEXT: movl %esi, %ecx
-; FALLBACK1-NEXT: shlb $5, %cl
-; FALLBACK1-NEXT: xorps %xmm0, %xmm0
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: andb $6, %sil
-; FALLBACK1-NEXT: movzbl %sil, %eax
-; FALLBACK1-NEXT: movq
-56(%rsp,%rax,4), %rsi -; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi -; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8 -; FALLBACK1-NEXT: movq %r8, %r9 -; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK1-NEXT: shrq %cl, %rax -; FALLBACK1-NEXT: movq %rsi, 16(%rdx) -; FALLBACK1-NEXT: movq %rax, 24(%rdx) -; FALLBACK1-NEXT: movq %rdi, (%rdx) -; FALLBACK1-NEXT: movq %r9, 8(%rdx) -; FALLBACK1-NEXT: retq -; -; FALLBACK2-LABEL: lshr_32bytes_dwordOff: -; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %rdi -; FALLBACK2-NEXT: movzbl (%rsi), %esi -; FALLBACK2-NEXT: movl %esi, %eax -; FALLBACK2-NEXT: shlb $5, %al -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK2-NEXT: notb %al -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: lshr_32bytes_dwordOff: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %rdi -; FALLBACK3-NEXT: movzbl (%rsi), %esi -; FALLBACK3-NEXT: movl %esi, %ecx -; FALLBACK3-NEXT: shlb $5, %cl -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: andb $6, %sil -; FALLBACK3-NEXT: movzbl %sil, %eax -; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi -; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi -; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8 -; FALLBACK3-NEXT: movq %r8, %r9 -; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 -; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi -; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi -; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax -; FALLBACK3-NEXT: movq %rsi, 16(%rdx) -; FALLBACK3-NEXT: movq %rax, 24(%rdx) -; FALLBACK3-NEXT: movq %rdi, (%rdx) -; FALLBACK3-NEXT: movq %r9, 8(%rdx) -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: lshr_32bytes_dwordOff: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: pushq 
%rbx -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK4-NEXT: movzbl (%rsi), %ecx -; FALLBACK4-NEXT: movl %ecx, %eax -; FALLBACK4-NEXT: shlb $5, %al -; FALLBACK4-NEXT: xorps %xmm2, %xmm2 -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: andb $6, %cl -; FALLBACK4-NEXT: movzbl %cl, %r9d -; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rdi -; FALLBACK4-NEXT: orq %r10, %rdi -; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK4-NEXT: movq %r10, %r11 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r11, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: addq %r10, %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r8, %r10 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r9 -; FALLBACK4-NEXT: movq %r9, 24(%rdx) -; FALLBACK4-NEXT: movq %r10, 8(%rdx) -; FALLBACK4-NEXT: movq %rbx, 16(%rdx) -; FALLBACK4-NEXT: movq %rdi, (%rdx) -; FALLBACK4-NEXT: popq %rbx -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: lshr_32bytes_dwordOff: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK5-NEXT: movzbl (%rsi), %eax -; FALLBACK5-NEXT: movl %eax, %ecx -; FALLBACK5-NEXT: shlb $5, %cl -; FALLBACK5-NEXT: xorps %xmm2, %xmm2 -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: andb $6, %al -; FALLBACK5-NEXT: movzbl %al, %eax -; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK5-NEXT: movq %rdi, %r8 -; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK5-NEXT: movq %rax, %r10 -; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK5-NEXT: shrq %cl, %rsi -; FALLBACK5-NEXT: movq %r10, 8(%rdx) -; FALLBACK5-NEXT: movq %r8, 16(%rdx) -; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: lshr_32bytes_dwordOff: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: movl %ecx, %eax -; FALLBACK6-NEXT: shlb $5, %al -; FALLBACK6-NEXT: xorps %xmm2, %xmm2 -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: andb $6, %cl -; FALLBACK6-NEXT: movzbl %cl, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 
-; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) -; FALLBACK6-NEXT: retq -; -; FALLBACK7-LABEL: lshr_32bytes_dwordOff: -; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: movups (%rdi), %xmm0 -; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK7-NEXT: movzbl (%rsi), %eax -; FALLBACK7-NEXT: movl %eax, %ecx -; FALLBACK7-NEXT: shlb $5, %cl -; FALLBACK7-NEXT: xorps %xmm2, %xmm2 -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: andb $6, %al -; FALLBACK7-NEXT: movzbl %al, %eax -; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK7-NEXT: movq %rdi, %r8 -; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK7-NEXT: movq %rax, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax -; FALLBACK7-NEXT: movq %r10, 8(%rdx) -; FALLBACK7-NEXT: movq %r8, 16(%rdx) -; FALLBACK7-NEXT: movq %rax, 24(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) -; FALLBACK7-NEXT: retq -; -; FALLBACK8-LABEL: lshr_32bytes_dwordOff: -; FALLBACK8: # %bb.0: -; FALLBACK8-NEXT: pushq %rbx -; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK8-NEXT: movzbl (%rsi), %ecx -; FALLBACK8-NEXT: movl %ecx, %eax -; FALLBACK8-NEXT: shlb $5, %al -; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: andb $6, %cl -; FALLBACK8-NEXT: movzbl %cl, %r9d -; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: movl %eax, %esi -; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rdi -; FALLBACK8-NEXT: orq %r10, %rdi -; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10 -; FALLBACK8-NEXT: movq %r10, %r11 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9 -; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r11, %rbx -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: addq %r10, %r10 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r8, %r10 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r9 -; FALLBACK8-NEXT: movq %r9, 24(%rdx) -; FALLBACK8-NEXT: movq %r10, 8(%rdx) -; FALLBACK8-NEXT: movq %rbx, 16(%rdx) -; FALLBACK8-NEXT: movq %rdi, (%rdx) -; FALLBACK8-NEXT: popq 
%rbx -; FALLBACK8-NEXT: vzeroupper -; FALLBACK8-NEXT: retq -; -; FALLBACK9-LABEL: lshr_32bytes_dwordOff: -; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK9-NEXT: movzbl (%rsi), %eax -; FALLBACK9-NEXT: movl %eax, %ecx -; FALLBACK9-NEXT: shlb $5, %cl -; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: andb $6, %al -; FALLBACK9-NEXT: movzbl %al, %eax -; FALLBACK9-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK9-NEXT: movq %rdi, %r8 -; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK9-NEXT: movq %rax, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK9-NEXT: shrq %cl, %rsi -; FALLBACK9-NEXT: movq %r10, 8(%rdx) -; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) -; FALLBACK9-NEXT: vzeroupper -; FALLBACK9-NEXT: retq -; -; FALLBACK10-LABEL: lshr_32bytes_dwordOff: -; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: movl %ecx, %eax -; FALLBACK10-NEXT: shlb $5, %al -; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $6, %cl -; FALLBACK10-NEXT: movzbl %cl, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) -; FALLBACK10-NEXT: vzeroupper -; FALLBACK10-NEXT: retq -; -; FALLBACK11-LABEL: lshr_32bytes_dwordOff: -; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK11-NEXT: movzbl (%rsi), %eax -; FALLBACK11-NEXT: movl %eax, %ecx -; FALLBACK11-NEXT: shlb $5, %cl -; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: andb $6, %al -; FALLBACK11-NEXT: movzbl %al, %eax -; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi -; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi -; FALLBACK11-NEXT: movq %rdi, %r8 -; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 -; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9 -; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax -; FALLBACK11-NEXT: movq %rax, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 -; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 -; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax -; FALLBACK11-NEXT: movq %r10, 8(%rdx) -; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rax, 24(%rdx) -; FALLBACK11-NEXT: movq %r9, 
(%rdx)
-; FALLBACK11-NEXT: vzeroupper
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: pushq %rbx
-; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK12-NEXT: movzbl (%rsi), %ecx
-; FALLBACK12-NEXT: movl %ecx, %eax
-; FALLBACK12-NEXT: shlb $5, %al
-; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: andb $6, %cl
-; FALLBACK12-NEXT: movzbl %cl, %r9d
-; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10
-; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rdi
-; FALLBACK12-NEXT: orq %r10, %rdi
-; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10
-; FALLBACK12-NEXT: movq %r10, %r11
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r11
-; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rbx
-; FALLBACK12-NEXT: orq %r11, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r8
-; FALLBACK12-NEXT: addq %r10, %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: orq %r8, %r10
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r9
-; FALLBACK12-NEXT: movq %r9, 24(%rdx)
-; FALLBACK12-NEXT: movq %r10, 8(%rdx)
-; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK12-NEXT: movq %rdi, (%rdx)
-; FALLBACK12-NEXT: popq %rbx
-; FALLBACK12-NEXT: vzeroupper
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK13-NEXT: movzbl (%rsi), %eax
-; FALLBACK13-NEXT: movl %eax, %ecx
-; FALLBACK13-NEXT: shlb $5, %cl
-; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: andb $6, %al
-; FALLBACK13-NEXT: movzbl %al, %eax
-; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK13-NEXT: movq %rdi, %r8
-; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK13-NEXT: movq %rax, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK13-NEXT: shrq %cl, %rsi
-; FALLBACK13-NEXT: movq %r10, 8(%rdx)
-; FALLBACK13-NEXT: movq %r8, 16(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK13-NEXT: movq %r9, (%rdx)
-; FALLBACK13-NEXT: vzeroupper
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK14-NEXT: movzbl (%rsi), %ecx
-; FALLBACK14-NEXT: movl %ecx, %eax
-; FALLBACK14-NEXT: shlb $5, %al
-; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: andb $6, %cl
-; FALLBACK14-NEXT: movzbl %cl, %ecx
-; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8
-; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11
-; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK14-NEXT: notb %al
-; FALLBACK14-NEXT: addq %rdi, %rdi
-; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT: orq %rsi, %rdi
-; FALLBACK14-NEXT: addq %rcx, %rcx
-; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT: orq %r9, %rcx
-; FALLBACK14-NEXT: addq %r8, %r8
-; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK14-NEXT: orq %r10, %rax
-; FALLBACK14-NEXT: movq %r11, 24(%rdx)
-; FALLBACK14-NEXT: movq %rax, 8(%rdx)
-; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT: movq %rdi, (%rdx)
-; FALLBACK14-NEXT: vzeroupper
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK15-NEXT: movzbl (%rsi), %eax
-; FALLBACK15-NEXT: movl %eax, %ecx
-; FALLBACK15-NEXT: shlb $5, %cl
-; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: andb $6, %al
-; FALLBACK15-NEXT: movzbl %al, %eax
-; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK15-NEXT: movq %rdi, %r8
-; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK15-NEXT: movq %rax, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax
-; FALLBACK15-NEXT: movq %r10, 8(%rdx)
-; FALLBACK15-NEXT: movq %r8, 16(%rdx)
-; FALLBACK15-NEXT: movq %rax, 24(%rdx)
-; FALLBACK15-NEXT: movq %r9, (%rdx)
-; FALLBACK15-NEXT: vzeroupper
-; FALLBACK15-NEXT: retq
+; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9,4), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9,4), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %cl
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %al
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %cl
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %cl, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -72(%rsp,%rcx,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rcx,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rcx,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rcx,%rcx), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rcx, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %al
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $6, %cl
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %r9d
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $6, %al
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %al
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X86-SSE2-LABEL: lshr_32bytes_dwordOff:
; X86-SSE2: # %bb.0:
@@ -5922,1955 +4936,1495 @@ define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
}
define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: shl_32bytes:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rcx
-; FALLBACK0-NEXT: movq 8(%rdi), %r8
-; FALLBACK0-NEXT: movq 16(%rdi), %r9
-; FALLBACK0-NEXT: movq 24(%rdi), %rdi
-; FALLBACK0-NEXT: movzbl (%rsi), %esi
-; FALLBACK0-NEXT: leal (,%rsi,8), %eax
-; FALLBACK0-NEXT: xorps %xmm0, %xmm0
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: andb $24, %sil
-; FALLBACK0-NEXT: negb %sil
-; FALLBACK0-NEXT: movsbq %sil, %r10
-; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8
-; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi
-; FALLBACK0-NEXT: movq %rdi, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq %r8, %r9
-; FALLBACK0-NEXT: shrq %r9
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r9
-; FALLBACK0-NEXT: orq %r11, %r9
-; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r11
-; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10
-; FALLBACK0-NEXT: movq %r10, %rbx
-; FALLBACK0-NEXT: shrq %rbx
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: orq %r11, %rbx
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: shrq %rdi
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rdi
-; FALLBACK0-NEXT: orq %r10, %rdi
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: movq %r8, (%rdx)
-; FALLBACK0-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK0-NEXT: movq %rbx, 24(%rdx)
-; FALLBACK0-NEXT: movq %r9, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: shl_32bytes:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: movq (%rdi), %rax
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %rdi
-; FALLBACK1-NEXT: movzbl (%rsi), %esi
-; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK1-NEXT: xorps %xmm0, %xmm0
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: andb $24, %sil
-; FALLBACK1-NEXT: negb %sil
-; FALLBACK1-NEXT: movsbq %sil, %rax
-; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK1-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK1-NEXT: shldq %cl, %r8, %rax
-; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK1-NEXT: shlq %cl, %r8
-; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK1-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK1-NEXT: movq %r8, (%rdx)
-; FALLBACK1-NEXT: movq %rax, 8(%rdx)
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: shl_32bytes:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: movq (%rdi), %rcx
-; FALLBACK2-NEXT: movq 8(%rdi), %r8
-; FALLBACK2-NEXT: movq 16(%rdi), %r9
-; FALLBACK2-NEXT: movq 24(%rdi), %rdi
-; FALLBACK2-NEXT: movzbl (%rsi), %esi
-; FALLBACK2-NEXT: leal (,%rsi,8), %eax
-; FALLBACK2-NEXT: xorps %xmm0, %xmm0
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: andb $24, %sil
-; FALLBACK2-NEXT: negb %sil
-; FALLBACK2-NEXT: movsbq %sil, %rsi
-; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi
-; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx
-; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8
-; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9
-; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi
-; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10
-; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11
-; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK2-NEXT: notb %al
-; FALLBACK2-NEXT: shrq %rdi
-; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT: orq %r8, %rdi
-; FALLBACK2-NEXT: shrq %rsi
-; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r9, %rsi
-; FALLBACK2-NEXT: shrq %rcx
-; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax
-; FALLBACK2-NEXT: orq %r10, %rax
-; FALLBACK2-NEXT: movq %r11, (%rdx)
-; FALLBACK2-NEXT: movq %rax, 16(%rdx)
-; FALLBACK2-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
-; FALLBACK2-NEXT: retq
-;
-; FALLBACK3-LABEL: shl_32bytes:
-; FALLBACK3: # %bb.0:
-; FALLBACK3-NEXT: movq (%rdi), %rax
-; FALLBACK3-NEXT: movq 8(%rdi), %r8
-; FALLBACK3-NEXT: movq 16(%rdi), %r9
-; FALLBACK3-NEXT: movq 24(%rdi), %rdi
-; FALLBACK3-NEXT: movzbl (%rsi), %esi
-; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK3-NEXT: xorps %xmm0, %xmm0
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: andb $24, %sil
-; FALLBACK3-NEXT: negb %sil
-; FALLBACK3-NEXT: movsbq %sil, %rax
-; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK3-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK3-NEXT: shldq %cl, %r8, %rax
-; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx
-; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK3-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK3-NEXT: movq %rcx, (%rdx)
-; FALLBACK3-NEXT: movq %rax, 8(%rdx)
-; FALLBACK3-NEXT: retq
-;
-; FALLBACK4-LABEL: shl_32bytes:
-; FALLBACK4: # %bb.0:
-; FALLBACK4-NEXT: movups (%rdi), %xmm0
-; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK4-NEXT: movzbl (%rsi), %ecx
-; FALLBACK4-NEXT: leal (,%rcx,8), %eax
-; FALLBACK4-NEXT: xorps %xmm2, %xmm2
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: andb $24, %cl
-; FALLBACK4-NEXT: negb %cl
-; FALLBACK4-NEXT: movsbq %cl, %r8
-; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r9
-; FALLBACK4-NEXT: movl %eax, %esi
-; FALLBACK4-NEXT: notb %sil
-; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10
-; FALLBACK4-NEXT: movq %r10, %rdi
-; FALLBACK4-NEXT: shrq %rdi
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %rdi
-; FALLBACK4-NEXT: orq %r9, %rdi
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r10
-; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9
-; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8
-; FALLBACK4-NEXT: movq %r8, %r11
-; FALLBACK4-NEXT: shrq %r11
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r11
-; FALLBACK4-NEXT: orq %r10, %r11
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r8
-; FALLBACK4-NEXT: movq %r9, %r10
-; FALLBACK4-NEXT: shrq %r10
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r10
-; FALLBACK4-NEXT: orq %r8, %r10
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r9
-; FALLBACK4-NEXT: movq %r9, (%rdx)
-; FALLBACK4-NEXT: movq %r10, 8(%rdx)
-; FALLBACK4-NEXT: movq %r11, 16(%rdx)
-; FALLBACK4-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK4-NEXT: retq
-;
-; FALLBACK5-LABEL: shl_32bytes:
-; FALLBACK5: # %bb.0:
-; FALLBACK5-NEXT: movups (%rdi), %xmm0
-; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK5-NEXT: movzbl (%rsi), %eax
-; FALLBACK5-NEXT: leal (,%rax,8), %ecx
-; FALLBACK5-NEXT: xorps %xmm2, %xmm2
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: andb $24, %al
-; FALLBACK5-NEXT: negb %al
-; FALLBACK5-NEXT: movsbq %al, %rax
-; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK5-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK5-NEXT: movq %r8, %r9
-; FALLBACK5-NEXT: shlq %cl, %r9
-; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK5-NEXT: shldq %cl, %r8, %rax
-; FALLBACK5-NEXT: movq %rax, 8(%rdx)
-; FALLBACK5-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK5-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK5-NEXT: movq %r9, (%rdx)
-; FALLBACK5-NEXT: retq
-;
-; FALLBACK6-LABEL: shl_32bytes:
-; FALLBACK6: # %bb.0:
-; FALLBACK6-NEXT: movups (%rdi), %xmm0
-; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK6-NEXT: movzbl (%rsi), %ecx
-; FALLBACK6-NEXT: leal (,%rcx,8), %eax
-; FALLBACK6-NEXT: xorps %xmm2, %xmm2
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: andb $24, %cl
-; FALLBACK6-NEXT: negb %cl
-; FALLBACK6-NEXT: movsbq %cl, %rcx
-; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi
-; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8
-; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9
-; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx
-; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10
-; FALLBACK6-NEXT: shlxq %rax, %r9, %r11
-; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK6-NEXT: notb %al
-; FALLBACK6-NEXT: shrq %rdi
-; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT: orq %rsi, %rdi
-; FALLBACK6-NEXT: shrq %rcx
-; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT: orq %r8, %rcx
-; FALLBACK6-NEXT: shrq %r9
-; FALLBACK6-NEXT: shrxq %rax, %r9, %rax
-; FALLBACK6-NEXT: orq %r10, %rax
-; FALLBACK6-NEXT: movq %r11, (%rdx)
-; FALLBACK6-NEXT: movq %rax, 8(%rdx)
-; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: shl_32bytes:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK7-NEXT: movzbl (%rsi), %eax
-; FALLBACK7-NEXT: leal (,%rax,8), %ecx
-; FALLBACK7-NEXT: xorps %xmm2, %xmm2
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: andb $24, %al
-; FALLBACK7-NEXT: negb %al
-; FALLBACK7-NEXT: movsbq %al, %rax
-; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK7-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9
-; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK7-NEXT: shldq %cl, %r8, %rax
-; FALLBACK7-NEXT: movq %rax, 8(%rdx)
-; FALLBACK7-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK7-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK7-NEXT: movq %r9, (%rdx)
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: shl_32bytes:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK8-NEXT: movzbl (%rsi), %ecx
-; FALLBACK8-NEXT: leal (,%rcx,8), %eax
-; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: andb $24, %cl
-; FALLBACK8-NEXT: negb %cl
-; FALLBACK8-NEXT: movsbq %cl, %r8
-; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r9
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10
-; FALLBACK8-NEXT: movq %r10, %rdi
-; FALLBACK8-NEXT: shrq %rdi
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %rdi
-; FALLBACK8-NEXT: orq %r9, %rdi
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9
-; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8
-; FALLBACK8-NEXT: movq %r8, %r11
-; FALLBACK8-NEXT: shrq %r11
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r11
-; FALLBACK8-NEXT: orq %r10, %r11
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r8
-; FALLBACK8-NEXT: movq %r9, %r10
-; FALLBACK8-NEXT: shrq %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: orq %r8, %r10
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r9
-; FALLBACK8-NEXT: movq %r9, (%rdx)
-; FALLBACK8-NEXT: movq %r10, 8(%rdx)
-; FALLBACK8-NEXT: movq %r11, 16(%rdx)
-; FALLBACK8-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK8-NEXT: vzeroupper
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: shl_32bytes:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK9-NEXT: movzbl (%rsi), %eax
-; FALLBACK9-NEXT: leal (,%rax,8), %ecx
-; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: andb $24, %al
-; FALLBACK9-NEXT: negb %al
-; FALLBACK9-NEXT: movsbq %al, %rax
-; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK9-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK9-NEXT: movq %r8, %r9
-; FALLBACK9-NEXT: shlq %cl, %r9
-; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK9-NEXT: shldq %cl, %r8, %rax
-; FALLBACK9-NEXT: movq %rax, 8(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK9-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK9-NEXT: movq %r9, (%rdx)
-; FALLBACK9-NEXT: vzeroupper
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: shl_32bytes:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT: movzbl (%rsi), %ecx
-; FALLBACK10-NEXT: leal (,%rcx,8), %eax
-; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: andb $24, %cl
-; FALLBACK10-NEXT: negb %cl
-; FALLBACK10-NEXT: movsbq %cl, %rcx
-; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi
-; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8
-; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9
-; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx
-; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10
-; FALLBACK10-NEXT: shlxq %rax, %r9, %r11
-; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK10-NEXT: notb %al
-; FALLBACK10-NEXT: shrq %rdi
-; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT: orq %rsi, %rdi
-; FALLBACK10-NEXT: shrq %rcx
-; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT: orq %r8, %rcx
-; FALLBACK10-NEXT: shrq %r9
-; FALLBACK10-NEXT: shrxq %rax, %r9, %rax
-; FALLBACK10-NEXT: orq %r10, %rax
-; FALLBACK10-NEXT: movq %r11, (%rdx)
-; FALLBACK10-NEXT: movq %rax, 8(%rdx)
-; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK10-NEXT: vzeroupper
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: shl_32bytes:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK11-NEXT: movzbl (%rsi), %eax
-; FALLBACK11-NEXT: leal (,%rax,8), %ecx
-; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: andb $24, %al
-; FALLBACK11-NEXT: negb %al
-; FALLBACK11-NEXT: movsbq %al, %rax
-; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK11-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9
-; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK11-NEXT: shldq %cl, %r8, %rax
-; FALLBACK11-NEXT: movq %rax, 8(%rdx)
-; FALLBACK11-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK11-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK11-NEXT: movq %r9, (%rdx)
-; FALLBACK11-NEXT: vzeroupper
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: shl_32bytes:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK12-NEXT: movzbl (%rsi), %ecx
-; FALLBACK12-NEXT: leal (,%rcx,8), %eax
-; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: andb $24, %cl
-; FALLBACK12-NEXT: negb %cl
-; FALLBACK12-NEXT: movsbq %cl, %r8
-; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r9
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10
-; FALLBACK12-NEXT: movq %r10, %rdi
-; FALLBACK12-NEXT: shrq %rdi
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %rdi
-; FALLBACK12-NEXT: orq %r9, %rdi
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9
-; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8
-; FALLBACK12-NEXT: movq %r8, %r11
-; FALLBACK12-NEXT: shrq %r11
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r11
-; FALLBACK12-NEXT: orq %r10, %r11
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r8
-; FALLBACK12-NEXT: movq %r9, %r10
-; FALLBACK12-NEXT: shrq %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: orq %r8, %r10
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r9
-; FALLBACK12-NEXT: movq %r9, (%rdx)
-; FALLBACK12-NEXT: movq %r10, 8(%rdx)
-; FALLBACK12-NEXT: movq %r11, 16(%rdx)
-; FALLBACK12-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK12-NEXT: vzeroupper
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: shl_32bytes:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK13-NEXT: movzbl (%rsi), %eax
-; FALLBACK13-NEXT: leal (,%rax,8), %ecx
-; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: andb $24, %al
-; FALLBACK13-NEXT: negb %al
-; FALLBACK13-NEXT: movsbq %al, %rax
-; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK13-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK13-NEXT: movq %r8, %r9
-; FALLBACK13-NEXT: shlq %cl, %r9
-; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK13-NEXT: shldq %cl, %r8, %rax
-; FALLBACK13-NEXT: movq %rax, 8(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK13-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK13-NEXT: movq %r9, (%rdx)
-; FALLBACK13-NEXT: vzeroupper
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: shl_32bytes:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK14-NEXT: movzbl (%rsi), %ecx
-; FALLBACK14-NEXT: leal (,%rcx,8), %eax
-; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: andb $24, %cl
-; FALLBACK14-NEXT: negb %cl
-; FALLBACK14-NEXT: movsbq %cl, %rcx
-; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi
-; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8
-; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9
-; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx
-; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10
-; FALLBACK14-NEXT: shlxq %rax, %r9, %r11
-; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK14-NEXT: notb %al
-; FALLBACK14-NEXT: shrq %rdi
-; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT: orq %rsi, %rdi
-; FALLBACK14-NEXT: shrq %rcx
-; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT: orq %r8, %rcx
-; FALLBACK14-NEXT: shrq %r9
-; FALLBACK14-NEXT: shrxq %rax, %r9, %rax
-; FALLBACK14-NEXT: orq %r10, %rax
-; FALLBACK14-NEXT: movq %r11, (%rdx)
-; FALLBACK14-NEXT: movq %rax, 8(%rdx)
-; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK14-NEXT: vzeroupper
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: shl_32bytes:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK15-NEXT: movzbl (%rsi), %eax
-; FALLBACK15-NEXT: leal (,%rax,8), %ecx
-; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: andb $24, %al
-; FALLBACK15-NEXT: negb %al
-; FALLBACK15-NEXT: movsbq %al, %rax
-; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK15-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9
-; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK15-NEXT: shldq %cl, %r8, %rax
-; FALLBACK15-NEXT: movq %rax, 8(%rdx)
-; FALLBACK15-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK15-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK15-NEXT: movq %r9, (%rdx)
-; FALLBACK15-NEXT: vzeroupper
-; FALLBACK15-NEXT: retq
-;
-; FALLBACK16-LABEL: shl_32bytes:
-; FALLBACK16: # %bb.0:
-; FALLBACK16-NEXT: pushl %ebp
-; FALLBACK16-NEXT: pushl %ebx
-; FALLBACK16-NEXT: pushl %edi
-; FALLBACK16-NEXT: pushl %esi
-; FALLBACK16-NEXT: subl $108, %esp
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT: movl (%ecx), %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 4(%ecx), %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 8(%ecx), %esi
-; FALLBACK16-NEXT: movl 12(%ecx), %edi
-; FALLBACK16-NEXT: movl 16(%ecx), %ebx
-; FALLBACK16-NEXT: movb (%eax), %ah
-; FALLBACK16-NEXT: movl 20(%ecx), %ebp
-; FALLBACK16-NEXT: movl 24(%ecx), %edx
-; FALLBACK16-NEXT: movl 28(%ecx), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movb %ah, %ch
-; FALLBACK16-NEXT: shlb $3, %ch
-; FALLBACK16-NEXT: xorps %xmm0, %xmm0
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: andb $28, %ah
-; FALLBACK16-NEXT: negb %ah
-; FALLBACK16-NEXT: movsbl %ah, %ebx
-; FALLBACK16-NEXT: movl 64(%esp,%ebx), %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 68(%esp,%ebx), %eax
-; FALLBACK16-NEXT: movl %eax, %esi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: movb %ch, %dl
-; FALLBACK16-NEXT: notb %dl
-; FALLBACK16-NEXT: shrl %edi
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: orl %esi, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 76(%esp,%ebx), %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: movl 72(%esp,%ebx), %esi
-; FALLBACK16-NEXT: movl %esi, %ebp
-; FALLBACK16-NEXT: shrl %ebp
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: orl %edi, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: shrl %eax
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: orl %esi, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 84(%esp,%ebx), %esi
-; FALLBACK16-NEXT: movl %esi, %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: movl 80(%esp,%ebx), %edi
-; FALLBACK16-NEXT: movl %edi, %ebp
-; FALLBACK16-NEXT: shrl %ebp
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: orl %eax, %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: shrl %eax
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: orl %edi, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 92(%esp,%ebx), %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: movl 88(%esp,%ebx), %edi
-; FALLBACK16-NEXT: movl %edi, %ebx
-; FALLBACK16-NEXT: shrl %ebx
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: orl %eax, %ebx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: shrl %esi
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: orl %edi, %esi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl %edx, (%eax)
-; FALLBACK16-NEXT: movl %esi, 24(%eax)
-; FALLBACK16-NEXT: movl %ebx, 28(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 16(%eax)
-; FALLBACK16-NEXT: movl %ebp, 20(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 8(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 12(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 4(%eax)
-; FALLBACK16-NEXT: addl $108, %esp
-; FALLBACK16-NEXT: popl %esi
-; FALLBACK16-NEXT: popl %edi
-; FALLBACK16-NEXT: popl %ebx
-; FALLBACK16-NEXT: popl %ebp
-; FALLBACK16-NEXT: retl
-;
-; FALLBACK17-LABEL: shl_32bytes:
-; FALLBACK17: # %bb.0:
-; FALLBACK17-NEXT: pushl %ebp
-; FALLBACK17-NEXT: pushl %ebx
-; FALLBACK17-NEXT: pushl %edi
-; FALLBACK17-NEXT: pushl %esi
-; FALLBACK17-NEXT: subl $92, %esp
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK17-NEXT: movl (%eax), %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 4(%eax), %edx
-; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: movl 8(%eax), %esi
-; FALLBACK17-NEXT: movl 12(%eax), %edi
-; FALLBACK17-NEXT: movl 16(%eax), %ebx
-; FALLBACK17-NEXT: movb (%ecx), %ch
-; FALLBACK17-NEXT: movl 20(%eax), %ebp
-; FALLBACK17-NEXT: movl 24(%eax), %edx
-; FALLBACK17-NEXT: movl 28(%eax), %eax
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movb %ch, %cl
-; FALLBACK17-NEXT: shlb $3, %cl
-; FALLBACK17-NEXT: xorps %xmm0, %xmm0
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: andb $28, %ch
-; FALLBACK17-NEXT: negb %ch
-; FALLBACK17-NEXT: movsbl %ch, %eax
-; FALLBACK17-NEXT: movl 56(%esp,%eax), %edx
-; FALLBACK17-NEXT: movl 60(%esp,%eax), %ebx
-; FALLBACK17-NEXT: movl %ebx, %esi
-; FALLBACK17-NEXT: shldl %cl, %edx, %esi
-; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 52(%esp,%eax), %esi
-; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %esi, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 64(%esp,%eax), %edi
-; FALLBACK17-NEXT: movl 68(%esp,%eax), %ebp
-; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK17-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK17-NEXT: movl 48(%esp,%eax), %ebx
-; FALLBACK17-NEXT: movl 72(%esp,%eax), %edx
-; FALLBACK17-NEXT: movl 76(%esp,%eax), %esi
-; FALLBACK17-NEXT: shldl %cl, %edx, %esi
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: shldl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK17-NEXT: movl %edx, 24(%eax)
-; FALLBACK17-NEXT: movl %esi, 28(%eax)
-; FALLBACK17-NEXT: movl %edi, 16(%eax)
-; FALLBACK17-NEXT: movl %ebp, 20(%eax)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, 8(%eax)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, 12(%eax)
-; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
-; FALLBACK17-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK17-NEXT: shll %cl, %ebx
-; FALLBACK17-NEXT: movl %ebx, (%eax)
-; FALLBACK17-NEXT: movl %edx, 4(%eax)
-; FALLBACK17-NEXT: addl $92, %esp
-; FALLBACK17-NEXT: popl %esi
-; FALLBACK17-NEXT: popl %edi
-; FALLBACK17-NEXT: popl %ebx
-; FALLBACK17-NEXT: popl %ebp
-; FALLBACK17-NEXT: retl
-;
-; FALLBACK18-LABEL: shl_32bytes:
-; FALLBACK18: # %bb.0:
-; FALLBACK18-NEXT: pushl %ebp
-; FALLBACK18-NEXT: pushl %ebx
-; FALLBACK18-NEXT: pushl %edi
-; FALLBACK18-NEXT: pushl %esi
-; FALLBACK18-NEXT: subl $108, %esp
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl (%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 4(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 8(%eax), %esi
-; FALLBACK18-NEXT: movl 12(%eax), %edi
-; FALLBACK18-NEXT: movl 16(%eax), %ebp
-; FALLBACK18-NEXT: movzbl (%ebx), %ebx
-; FALLBACK18-NEXT: movl 20(%eax), %edx
-; FALLBACK18-NEXT: movl 24(%eax), %ecx
-; FALLBACK18-NEXT: movl 28(%eax), %eax
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebx, %edx
-; FALLBACK18-NEXT: shlb $3, %dl
-; FALLBACK18-NEXT: xorps %xmm0, %xmm0
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: andb $28, %bl
-; FALLBACK18-NEXT: negb %bl
-; FALLBACK18-NEXT: movsbl %bl, %esi
-; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx
-; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 68(%esp,%esi), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, %eax, %edi
-; FALLBACK18-NEXT: movl %edx, %ecx
-; FALLBACK18-NEXT: notb %cl
-; FALLBACK18-NEXT: shrl %ebx
-; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebx
-; FALLBACK18-NEXT: orl %edi, %ebx
-; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 72(%esp,%esi), %ebx
-; FALLBACK18-NEXT: movl %ebx, %edi
-; FALLBACK18-NEXT: shrl %edi
-; FALLBACK18-NEXT: shrxl %ecx, %edi, %eax
-; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi
-; FALLBACK18-NEXT: shlxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebx
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: shrl %eax
-; FALLBACK18-NEXT: shrxl %ecx, %eax, %eax
-; FALLBACK18-NEXT: orl %ebx, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 80(%esp,%esi), %ebx
-; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrl %ebx
-; FALLBACK18-NEXT: shrxl %ecx, %ebx, %eax
-; FALLBACK18-NEXT: movl 84(%esp,%esi), %ebx
-; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrl %edi
-; FALLBACK18-NEXT: shrxl %ecx, %edi, %edi
-; FALLBACK18-NEXT: orl %eax, %edi
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, 92(%esp,%esi), %ebp
-; FALLBACK18-NEXT: movl 88(%esp,%esi), %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %eax
-; FALLBACK18-NEXT: shrl %esi
-; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi
-; FALLBACK18-NEXT: orl %ebp, %esi
-; FALLBACK18-NEXT: shrl %ebx
-; FALLBACK18-NEXT: shrxl %ecx, %ebx, %edx
-; FALLBACK18-NEXT: orl %eax, %edx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, (%eax)
-; FALLBACK18-NEXT: movl %edx, 24(%eax)
-; FALLBACK18-NEXT: movl %esi, 28(%eax)
-; FALLBACK18-NEXT: movl %edi, 16(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 20(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 8(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 12(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 4(%eax)
-; FALLBACK18-NEXT: addl $108, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: shl_32bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $92, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl (%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 4(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 8(%ecx), %esi
-; FALLBACK19-NEXT: movl 12(%ecx), %edi
-; FALLBACK19-NEXT: movl 16(%ecx), %ebp
-; FALLBACK19-NEXT: movzbl (%ebx), %ebx
-; FALLBACK19-NEXT: movl 20(%ecx), %edx
-; FALLBACK19-NEXT: movl 24(%ecx), %eax
-; FALLBACK19-NEXT: movl 28(%ecx), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, %ecx
-; FALLBACK19-NEXT: shlb $3, %cl
-; FALLBACK19-NEXT: xorps %xmm0, %xmm0
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: andb $28, %bl
-; FALLBACK19-NEXT: negb %bl
-; FALLBACK19-NEXT: movsbl %bl, %eax
-; FALLBACK19-NEXT: movl 56(%esp,%eax), %edx
-; FALLBACK19-NEXT: movl 60(%esp,%eax), %esi
-; FALLBACK19-NEXT: movl %esi, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %edx, %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 52(%esp,%eax), %ebx
-; FALLBACK19-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 64(%esp,%eax), %edi
-; FALLBACK19-NEXT: movl 68(%esp,%eax), %ebp
-; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload
-; FALLBACK19-NEXT: shldl %cl, %edx, %edi
-; FALLBACK19-NEXT: movl 48(%esp,%eax), %edx
-; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 72(%esp,%eax), %edx
-; FALLBACK19-NEXT: movl 76(%esp,%eax), %esi -; FALLBACK19-NEXT: shldl %cl, %edx, %esi -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK19-NEXT: shldl %cl, %eax, %edx -; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK19-NEXT: movl %edx, 24(%eax) -; FALLBACK19-NEXT: movl %esi, 28(%eax) -; FALLBACK19-NEXT: movl %edi, 16(%eax) -; FALLBACK19-NEXT: movl %ebp, 20(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, 8(%eax) -; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK19-NEXT: movl %edx, 12(%eax) -; FALLBACK19-NEXT: movl (%esp), %esi # 4-byte Reload -; FALLBACK19-NEXT: shlxl %ecx, %esi, %edx -; FALLBACK19-NEXT: movl %edx, (%eax) -; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK19-NEXT: shldl %cl, %esi, %ebx -; FALLBACK19-NEXT: movl %ebx, 4(%eax) -; FALLBACK19-NEXT: addl $92, %esp -; FALLBACK19-NEXT: popl %esi -; FALLBACK19-NEXT: popl %edi -; FALLBACK19-NEXT: popl %ebx -; FALLBACK19-NEXT: popl %ebp -; FALLBACK19-NEXT: retl -; -; FALLBACK20-LABEL: shl_32bytes: -; FALLBACK20: # %bb.0: -; FALLBACK20-NEXT: pushl %ebp -; FALLBACK20-NEXT: pushl %ebx -; FALLBACK20-NEXT: pushl %edi -; FALLBACK20-NEXT: pushl %esi -; FALLBACK20-NEXT: subl $108, %esp -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: movups (%ecx), %xmm0 -; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK20-NEXT: movzbl (%eax), %ecx -; FALLBACK20-NEXT: movb %cl, %dh -; FALLBACK20-NEXT: shlb $3, %dh -; FALLBACK20-NEXT: xorps %xmm2, %xmm2 -; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: andb $28, %cl -; FALLBACK20-NEXT: negb %cl -; FALLBACK20-NEXT: movsbl %cl, %ebx -; FALLBACK20-NEXT: movl 84(%esp,%ebx), %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: movb %dh, %dl -; FALLBACK20-NEXT: notb %dl -; FALLBACK20-NEXT: movl 80(%esp,%ebx), %esi -; FALLBACK20-NEXT: movl %esi, %eax -; FALLBACK20-NEXT: shrl %eax -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: orl %edi, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl %ebx, %edi -; FALLBACK20-NEXT: movl 76(%esp,%ebx), %ebp -; FALLBACK20-NEXT: movl %ebp, %eax -; FALLBACK20-NEXT: shrl %eax -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: orl %esi, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: movl 72(%esp,%ebx), %ebx -; FALLBACK20-NEXT: movl %ebx, %eax -; FALLBACK20-NEXT: shrl %eax -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 68(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %esi -; FALLBACK20-NEXT: shrl %esi -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %esi -; 
FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: movl 64(%esp,%edi), %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: shrl %ebx -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl 88(%esp,%edi), %ebp -; FALLBACK20-NEXT: movl %ebp, %edi -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: shrl %eax -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: orl %edi, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movl 92(%esp,%eax), %edi -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: shrl %ebp -; FALLBACK20-NEXT: movl %edx, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: orl %edi, %ebp -; FALLBACK20-NEXT: movb %dh, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl %edx, (%eax) -; FALLBACK20-NEXT: movl %ebp, 28(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 24(%eax) -; FALLBACK20-NEXT: movl %ebx, 4(%eax) -; FALLBACK20-NEXT: movl %esi, 8(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 12(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 16(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 20(%eax) -; FALLBACK20-NEXT: addl $108, %esp -; FALLBACK20-NEXT: popl %esi -; FALLBACK20-NEXT: popl %edi -; FALLBACK20-NEXT: popl %ebx -; FALLBACK20-NEXT: popl %ebp -; FALLBACK20-NEXT: retl -; -; FALLBACK21-LABEL: shl_32bytes: -; FALLBACK21: # %bb.0: -; FALLBACK21-NEXT: pushl %ebp -; FALLBACK21-NEXT: pushl %ebx -; FALLBACK21-NEXT: pushl %edi -; FALLBACK21-NEXT: pushl %esi -; FALLBACK21-NEXT: subl $92, %esp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movups (%ecx), %xmm0 -; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK21-NEXT: movzbl (%eax), %eax -; FALLBACK21-NEXT: movl %eax, %ecx -; FALLBACK21-NEXT: shlb $3, %cl -; FALLBACK21-NEXT: xorps %xmm2, %xmm2 -; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: andb $28, %al -; FALLBACK21-NEXT: negb %al -; FALLBACK21-NEXT: movsbl %al, %ebp -; FALLBACK21-NEXT: movl 64(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl 68(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shldl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %edx -; FALLBACK21-NEXT: shldl %cl, %edx, %eax -; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edi -; FALLBACK21-NEXT: shldl %cl, %edi, %edx -; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %ebx -; 
FALLBACK21-NEXT: shldl %cl, %ebx, %edi -; FALLBACK21-NEXT: movl 72(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl %edx, %eax -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK21-NEXT: shldl %cl, %esi, %eax -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 76(%esp,%ebp), %ebp -; FALLBACK21-NEXT: shldl %cl, %edx, %ebp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK21-NEXT: movl %ebp, 28(%edx) -; FALLBACK21-NEXT: movl %eax, 24(%edx) -; FALLBACK21-NEXT: movl %esi, %eax -; FALLBACK21-NEXT: shll %cl, %eax -; FALLBACK21-NEXT: shldl %cl, %esi, %ebx -; FALLBACK21-NEXT: movl %ebx, 4(%edx) -; FALLBACK21-NEXT: movl %edi, 8(%edx) -; FALLBACK21-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK21-NEXT: movl %ecx, 12(%edx) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK21-NEXT: movl %ecx, 16(%edx) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK21-NEXT: movl %ecx, 20(%edx) -; FALLBACK21-NEXT: movl %eax, (%edx) -; FALLBACK21-NEXT: addl $92, %esp -; FALLBACK21-NEXT: popl %esi -; FALLBACK21-NEXT: popl %edi -; FALLBACK21-NEXT: popl %ebx -; FALLBACK21-NEXT: popl %ebp -; FALLBACK21-NEXT: retl -; -; FALLBACK22-LABEL: shl_32bytes: -; FALLBACK22: # %bb.0: -; FALLBACK22-NEXT: pushl %ebp -; FALLBACK22-NEXT: pushl %ebx -; FALLBACK22-NEXT: pushl %edi -; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $108, %esp -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shlb $3, %al -; FALLBACK22-NEXT: xorps %xmm2, %xmm2 -; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: negb %cl -; FALLBACK22-NEXT: movsbl %cl, %edx -; FALLBACK22-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK22-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK22-NEXT: shlxl %eax, %esi, %edi -; FALLBACK22-NEXT: movl %eax, %ebx -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, %esi -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK22-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %edi -; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK22-NEXT: orl %ecx, %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %edi -; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK22-NEXT: orl %ecx, %ebp -; FALLBACK22-NEXT: shlxl %eax, %esi, %edi -; FALLBACK22-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %ecx -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, 
%ecx -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK22-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK22-NEXT: shlxl %eax, %edx, %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: shrl %edx -; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK22-NEXT: orl %edi, %edx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK22-NEXT: movl %edi, (%esi) -; FALLBACK22-NEXT: movl %edx, 28(%esi) -; FALLBACK22-NEXT: movl %eax, 24(%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) -; FALLBACK22-NEXT: movl %ebp, 8(%esi) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 12(%esi) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 16(%esi) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%esi) -; FALLBACK22-NEXT: addl $108, %esp -; FALLBACK22-NEXT: popl %esi -; FALLBACK22-NEXT: popl %edi -; FALLBACK22-NEXT: popl %ebx -; FALLBACK22-NEXT: popl %ebp -; FALLBACK22-NEXT: retl -; -; FALLBACK23-LABEL: shl_32bytes: -; FALLBACK23: # %bb.0: -; FALLBACK23-NEXT: pushl %ebp -; FALLBACK23-NEXT: pushl %ebx -; FALLBACK23-NEXT: pushl %edi -; FALLBACK23-NEXT: pushl %esi -; FALLBACK23-NEXT: subl $92, %esp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movups (%ecx), %xmm0 -; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK23-NEXT: movzbl (%eax), %eax -; FALLBACK23-NEXT: movl %eax, %ecx -; FALLBACK23-NEXT: shlb $3, %cl -; FALLBACK23-NEXT: xorps %xmm2, %xmm2 -; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: andb $28, %al -; FALLBACK23-NEXT: negb %al -; FALLBACK23-NEXT: movsbl %al, %ebx -; FALLBACK23-NEXT: movl 64(%esp,%ebx), %eax -; FALLBACK23-NEXT: movl 68(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shldl %cl, %eax, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 60(%esp,%ebx), %edx -; FALLBACK23-NEXT: shldl %cl, %edx, %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 56(%esp,%ebx), %edi -; FALLBACK23-NEXT: shldl %cl, %edi, %edx -; FALLBACK23-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK23-NEXT: movl 52(%esp,%ebx), %ebp -; FALLBACK23-NEXT: shldl %cl, %ebp, %edi -; FALLBACK23-NEXT: movl 72(%esp,%ebx), %edx -; FALLBACK23-NEXT: movl %edx, %eax -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK23-NEXT: shldl %cl, %esi, %eax -; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK23-NEXT: movl 76(%esp,%ebx), %ebx -; FALLBACK23-NEXT: shldl %cl, %edx, %ebx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK23-NEXT: movl %ebx, 28(%edx) -; FALLBACK23-NEXT: movl %eax, 24(%edx) -; FALLBACK23-NEXT: shlxl %ecx, %esi, %eax -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: shldl %cl, %esi, %ebp -; FALLBACK23-NEXT: movl 
%ebp, 4(%edx) -; FALLBACK23-NEXT: movl %edi, 8(%edx) -; FALLBACK23-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 12(%edx) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 16(%edx) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 20(%edx) -; FALLBACK23-NEXT: movl %eax, (%edx) -; FALLBACK23-NEXT: addl $92, %esp -; FALLBACK23-NEXT: popl %esi -; FALLBACK23-NEXT: popl %edi -; FALLBACK23-NEXT: popl %ebx -; FALLBACK23-NEXT: popl %ebp -; FALLBACK23-NEXT: retl -; -; FALLBACK24-LABEL: shl_32bytes: -; FALLBACK24: # %bb.0: -; FALLBACK24-NEXT: pushl %ebp -; FALLBACK24-NEXT: pushl %ebx -; FALLBACK24-NEXT: pushl %edi -; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $108, %esp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK24-NEXT: movzbl (%eax), %ecx -; FALLBACK24-NEXT: movb %cl, %dh -; FALLBACK24-NEXT: shlb $3, %dh -; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: andb $28, %cl -; FALLBACK24-NEXT: negb %cl -; FALLBACK24-NEXT: movsbl %cl, %ebx -; FALLBACK24-NEXT: movl 84(%esp,%ebx), %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: movb %dh, %dl -; FALLBACK24-NEXT: notb %dl -; FALLBACK24-NEXT: movl 80(%esp,%ebx), %esi -; FALLBACK24-NEXT: movl %esi, %eax -; FALLBACK24-NEXT: shrl %eax -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: orl %edi, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl %ebx, %edi -; FALLBACK24-NEXT: movl 76(%esp,%ebx), %ebp -; FALLBACK24-NEXT: movl %ebp, %eax -; FALLBACK24-NEXT: shrl %eax -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: orl %esi, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: movl 72(%esp,%ebx), %ebx -; FALLBACK24-NEXT: movl %ebx, %eax -; FALLBACK24-NEXT: shrl %eax -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 68(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %esi -; FALLBACK24-NEXT: shrl %esi -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: movl 64(%esp,%edi), %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: shrl %ebx -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl 88(%esp,%edi), %ebp -; FALLBACK24-NEXT: movl %ebp, %edi -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: shrl %eax -; FALLBACK24-NEXT: movl %edx, 
%ecx -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: orl %edi, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movl 92(%esp,%eax), %edi -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: shrl %ebp -; FALLBACK24-NEXT: movl %edx, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: orl %edi, %ebp -; FALLBACK24-NEXT: movb %dh, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl %edx, (%eax) -; FALLBACK24-NEXT: movl %ebp, 28(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 24(%eax) -; FALLBACK24-NEXT: movl %ebx, 4(%eax) -; FALLBACK24-NEXT: movl %esi, 8(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 12(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 16(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 20(%eax) -; FALLBACK24-NEXT: addl $108, %esp -; FALLBACK24-NEXT: popl %esi -; FALLBACK24-NEXT: popl %edi -; FALLBACK24-NEXT: popl %ebx -; FALLBACK24-NEXT: popl %ebp -; FALLBACK24-NEXT: vzeroupper -; FALLBACK24-NEXT: retl -; -; FALLBACK25-LABEL: shl_32bytes: -; FALLBACK25: # %bb.0: -; FALLBACK25-NEXT: pushl %ebp -; FALLBACK25-NEXT: pushl %ebx -; FALLBACK25-NEXT: pushl %edi -; FALLBACK25-NEXT: pushl %esi -; FALLBACK25-NEXT: subl $92, %esp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK25-NEXT: movzbl (%eax), %eax -; FALLBACK25-NEXT: movl %eax, %ecx -; FALLBACK25-NEXT: shlb $3, %cl -; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: andb $28, %al -; FALLBACK25-NEXT: negb %al -; FALLBACK25-NEXT: movsbl %al, %ebp -; FALLBACK25-NEXT: movl 64(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl 68(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %edx -; FALLBACK25-NEXT: shldl %cl, %edx, %eax -; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edi -; FALLBACK25-NEXT: shldl %cl, %edi, %edx -; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %ebx -; FALLBACK25-NEXT: shldl %cl, %ebx, %edi -; FALLBACK25-NEXT: movl 72(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl %edx, %eax -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK25-NEXT: shldl %cl, %esi, %eax -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 76(%esp,%ebp), %ebp -; FALLBACK25-NEXT: shldl %cl, %edx, %ebp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK25-NEXT: movl %ebp, 28(%edx) -; FALLBACK25-NEXT: movl %eax, 24(%edx) -; FALLBACK25-NEXT: movl %esi, %eax -; FALLBACK25-NEXT: shll %cl, %eax -; FALLBACK25-NEXT: shldl %cl, %esi, %ebx -; FALLBACK25-NEXT: movl %ebx, 4(%edx) -; FALLBACK25-NEXT: movl %edi, 8(%edx) -; FALLBACK25-NEXT: movl (%esp), %ecx # 4-byte Reload -; 
FALLBACK25-NEXT: movl %ecx, 12(%edx) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK25-NEXT: movl %ecx, 16(%edx) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK25-NEXT: movl %ecx, 20(%edx) -; FALLBACK25-NEXT: movl %eax, (%edx) -; FALLBACK25-NEXT: addl $92, %esp -; FALLBACK25-NEXT: popl %esi -; FALLBACK25-NEXT: popl %edi -; FALLBACK25-NEXT: popl %ebx -; FALLBACK25-NEXT: popl %ebp -; FALLBACK25-NEXT: vzeroupper -; FALLBACK25-NEXT: retl -; -; FALLBACK26-LABEL: shl_32bytes: -; FALLBACK26: # %bb.0: -; FALLBACK26-NEXT: pushl %ebp -; FALLBACK26-NEXT: pushl %ebx -; FALLBACK26-NEXT: pushl %edi -; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $108, %esp -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax -; FALLBACK26-NEXT: shlb $3, %al -; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: negb %cl -; FALLBACK26-NEXT: movsbl %cl, %edx -; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK26-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK26-NEXT: shlxl %eax, %esi, %edi -; FALLBACK26-NEXT: movl %eax, %ebx -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, %esi -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK26-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %edi -; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK26-NEXT: orl %ecx, %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK26-NEXT: movl 68(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %edi -; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: orl %ecx, %ebp -; FALLBACK26-NEXT: shlxl %eax, %esi, %edi -; FALLBACK26-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %ecx -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %edi, %ecx -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK26-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK26-NEXT: shlxl %eax, %edx, %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: shrl %edx -; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK26-NEXT: orl %edi, %edx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK26-NEXT: movl %edi, (%esi) -; FALLBACK26-NEXT: movl %edx, 28(%esi) -; FALLBACK26-NEXT: movl %eax, 24(%esi) -; FALLBACK26-NEXT: movl 
%ecx, 4(%esi) -; FALLBACK26-NEXT: movl %ebp, 8(%esi) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 12(%esi) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 16(%esi) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%esi) -; FALLBACK26-NEXT: addl $108, %esp -; FALLBACK26-NEXT: popl %esi -; FALLBACK26-NEXT: popl %edi -; FALLBACK26-NEXT: popl %ebx -; FALLBACK26-NEXT: popl %ebp -; FALLBACK26-NEXT: vzeroupper -; FALLBACK26-NEXT: retl -; -; FALLBACK27-LABEL: shl_32bytes: -; FALLBACK27: # %bb.0: -; FALLBACK27-NEXT: pushl %ebp -; FALLBACK27-NEXT: pushl %ebx -; FALLBACK27-NEXT: pushl %edi -; FALLBACK27-NEXT: pushl %esi -; FALLBACK27-NEXT: subl $92, %esp -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK27-NEXT: movzbl (%eax), %eax -; FALLBACK27-NEXT: movl %eax, %ecx -; FALLBACK27-NEXT: shlb $3, %cl -; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: andb $28, %al -; FALLBACK27-NEXT: negb %al -; FALLBACK27-NEXT: movsbl %al, %ebx -; FALLBACK27-NEXT: movl 64(%esp,%ebx), %eax -; FALLBACK27-NEXT: movl 68(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %eax, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 60(%esp,%ebx), %edx -; FALLBACK27-NEXT: shldl %cl, %edx, %eax -; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 56(%esp,%ebx), %edi -; FALLBACK27-NEXT: shldl %cl, %edi, %edx -; FALLBACK27-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK27-NEXT: movl 52(%esp,%ebx), %ebp -; FALLBACK27-NEXT: shldl %cl, %ebp, %edi -; FALLBACK27-NEXT: movl 72(%esp,%ebx), %edx -; FALLBACK27-NEXT: movl %edx, %eax -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: shldl %cl, %esi, %eax -; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK27-NEXT: movl 76(%esp,%ebx), %ebx -; FALLBACK27-NEXT: shldl %cl, %edx, %ebx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK27-NEXT: movl %ebx, 28(%edx) -; FALLBACK27-NEXT: movl %eax, 24(%edx) -; FALLBACK27-NEXT: shlxl %ecx, %esi, %eax -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: shldl %cl, %esi, %ebp -; FALLBACK27-NEXT: movl %ebp, 4(%edx) -; FALLBACK27-NEXT: movl %edi, 8(%edx) -; FALLBACK27-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 12(%edx) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 16(%edx) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 20(%edx) -; FALLBACK27-NEXT: movl %eax, (%edx) -; FALLBACK27-NEXT: addl $92, %esp -; FALLBACK27-NEXT: popl %esi -; FALLBACK27-NEXT: popl %edi -; FALLBACK27-NEXT: popl %ebx -; FALLBACK27-NEXT: popl %ebp -; FALLBACK27-NEXT: vzeroupper -; FALLBACK27-NEXT: retl -; -; FALLBACK28-LABEL: shl_32bytes: -; FALLBACK28: # %bb.0: -; FALLBACK28-NEXT: pushl %ebp -; FALLBACK28-NEXT: pushl %ebx -; FALLBACK28-NEXT: pushl %edi -; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $108, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 
FALLBACK28-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK28-NEXT: movzbl (%eax), %ecx -; FALLBACK28-NEXT: movb %cl, %dh -; FALLBACK28-NEXT: shlb $3, %dh -; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: andb $28, %cl -; FALLBACK28-NEXT: negb %cl -; FALLBACK28-NEXT: movsbl %cl, %ebx -; FALLBACK28-NEXT: movl 84(%esp,%ebx), %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: movb %dh, %dl -; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: movl 80(%esp,%ebx), %esi -; FALLBACK28-NEXT: movl %esi, %eax -; FALLBACK28-NEXT: shrl %eax -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: orl %edi, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl %ebx, %edi -; FALLBACK28-NEXT: movl 76(%esp,%ebx), %ebp -; FALLBACK28-NEXT: movl %ebp, %eax -; FALLBACK28-NEXT: shrl %eax -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: orl %esi, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: movl 72(%esp,%ebx), %ebx -; FALLBACK28-NEXT: movl %ebx, %eax -; FALLBACK28-NEXT: shrl %eax -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 68(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %esi -; FALLBACK28-NEXT: shrl %esi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: movl 64(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: shrl %ebx -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl 88(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %edi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: shrl %eax -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: orl %edi, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl 92(%esp,%eax), %edi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %edi, %ebp -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl %edx, (%eax) -; FALLBACK28-NEXT: movl %ebp, 28(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 24(%eax) -; FALLBACK28-NEXT: movl %ebx, 4(%eax) -; FALLBACK28-NEXT: movl 
%esi, 8(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 12(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 16(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 20(%eax) -; FALLBACK28-NEXT: addl $108, %esp -; FALLBACK28-NEXT: popl %esi -; FALLBACK28-NEXT: popl %edi -; FALLBACK28-NEXT: popl %ebx -; FALLBACK28-NEXT: popl %ebp -; FALLBACK28-NEXT: vzeroupper -; FALLBACK28-NEXT: retl -; -; FALLBACK29-LABEL: shl_32bytes: -; FALLBACK29: # %bb.0: -; FALLBACK29-NEXT: pushl %ebp -; FALLBACK29-NEXT: pushl %ebx -; FALLBACK29-NEXT: pushl %edi -; FALLBACK29-NEXT: pushl %esi -; FALLBACK29-NEXT: subl $92, %esp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK29-NEXT: movzbl (%eax), %eax -; FALLBACK29-NEXT: movl %eax, %ecx -; FALLBACK29-NEXT: shlb $3, %cl -; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: andb $28, %al -; FALLBACK29-NEXT: negb %al -; FALLBACK29-NEXT: movsbl %al, %ebp -; FALLBACK29-NEXT: movl 64(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl 68(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %edx -; FALLBACK29-NEXT: shldl %cl, %edx, %eax -; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edi -; FALLBACK29-NEXT: shldl %cl, %edi, %edx -; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %ebx -; FALLBACK29-NEXT: shldl %cl, %ebx, %edi -; FALLBACK29-NEXT: movl 72(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl %edx, %eax -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK29-NEXT: shldl %cl, %esi, %eax -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 76(%esp,%ebp), %ebp -; FALLBACK29-NEXT: shldl %cl, %edx, %ebp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK29-NEXT: movl %ebp, 28(%edx) -; FALLBACK29-NEXT: movl %eax, 24(%edx) -; FALLBACK29-NEXT: movl %esi, %eax -; FALLBACK29-NEXT: shll %cl, %eax -; FALLBACK29-NEXT: shldl %cl, %esi, %ebx -; FALLBACK29-NEXT: movl %ebx, 4(%edx) -; FALLBACK29-NEXT: movl %edi, 8(%edx) -; FALLBACK29-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK29-NEXT: movl %ecx, 12(%edx) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK29-NEXT: movl %ecx, 16(%edx) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK29-NEXT: movl %ecx, 20(%edx) -; FALLBACK29-NEXT: movl %eax, (%edx) -; FALLBACK29-NEXT: addl $92, %esp -; FALLBACK29-NEXT: popl %esi -; FALLBACK29-NEXT: popl %edi -; FALLBACK29-NEXT: popl %ebx -; FALLBACK29-NEXT: popl %ebp -; FALLBACK29-NEXT: vzeroupper -; FALLBACK29-NEXT: retl -; -; FALLBACK30-LABEL: shl_32bytes: -; FALLBACK30: # %bb.0: -; FALLBACK30-NEXT: pushl %ebp -; FALLBACK30-NEXT: pushl %ebx -; FALLBACK30-NEXT: pushl %edi -; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $108, %esp -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; 
FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shlb $3, %al -; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: negb %cl -; FALLBACK30-NEXT: movsbl %cl, %edx -; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK30-NEXT: shlxl %eax, %esi, %edi -; FALLBACK30-NEXT: movl %eax, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, %esi -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK30-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %edi -; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK30-NEXT: orl %ecx, %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %edi -; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ecx, %ebp -; FALLBACK30-NEXT: shlxl %eax, %esi, %edi -; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %ecx -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %edi, %ecx -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK30-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK30-NEXT: shlxl %eax, %edx, %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: shrl %edx -; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK30-NEXT: orl %edi, %edx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK30-NEXT: movl %edi, (%esi) -; FALLBACK30-NEXT: movl %edx, 28(%esi) -; FALLBACK30-NEXT: movl %eax, 24(%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) -; FALLBACK30-NEXT: movl %ebp, 8(%esi) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 12(%esi) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 16(%esi) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%esi) -; FALLBACK30-NEXT: addl $108, %esp -; FALLBACK30-NEXT: popl %esi -; FALLBACK30-NEXT: popl %edi -; FALLBACK30-NEXT: popl %ebx -; FALLBACK30-NEXT: popl %ebp -; FALLBACK30-NEXT: vzeroupper -; FALLBACK30-NEXT: retl -; -; FALLBACK31-LABEL: shl_32bytes: -; FALLBACK31: # %bb.0: -; FALLBACK31-NEXT: pushl %ebp -; FALLBACK31-NEXT: pushl %ebx -; FALLBACK31-NEXT: pushl %edi -; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $92, %esp -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), 
%ecx -; FALLBACK31-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK31-NEXT: movzbl (%eax), %eax -; FALLBACK31-NEXT: movl %eax, %ecx -; FALLBACK31-NEXT: shlb $3, %cl -; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: andb $28, %al -; FALLBACK31-NEXT: negb %al -; FALLBACK31-NEXT: movsbl %al, %ebx -; FALLBACK31-NEXT: movl 64(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl 68(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %eax, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 60(%esp,%ebx), %edx -; FALLBACK31-NEXT: shldl %cl, %edx, %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 56(%esp,%ebx), %edi -; FALLBACK31-NEXT: shldl %cl, %edi, %edx -; FALLBACK31-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK31-NEXT: movl 52(%esp,%ebx), %ebp -; FALLBACK31-NEXT: shldl %cl, %ebp, %edi -; FALLBACK31-NEXT: movl 72(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl %edx, %eax -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: shldl %cl, %esi, %eax -; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK31-NEXT: movl 76(%esp,%ebx), %ebx -; FALLBACK31-NEXT: shldl %cl, %edx, %ebx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK31-NEXT: movl %ebx, 28(%edx) -; FALLBACK31-NEXT: movl %eax, 24(%edx) -; FALLBACK31-NEXT: shlxl %ecx, %esi, %eax -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: shldl %cl, %esi, %ebp -; FALLBACK31-NEXT: movl %ebp, 4(%edx) -; FALLBACK31-NEXT: movl %edi, 8(%edx) -; FALLBACK31-NEXT: movl (%esp), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 12(%edx) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 16(%edx) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 20(%edx) -; FALLBACK31-NEXT: movl %eax, (%edx) -; FALLBACK31-NEXT: addl $92, %esp -; FALLBACK31-NEXT: popl %esi -; FALLBACK31-NEXT: popl %edi -; FALLBACK31-NEXT: popl %ebx -; FALLBACK31-NEXT: popl %ebp -; FALLBACK31-NEXT: vzeroupper -; FALLBACK31-NEXT: retl +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%r10), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%r10), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 
%ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%r10), %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%r10), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 
8(%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rdi), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rsi, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rsi, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rcx, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi +; 
X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes: +; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %cl +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %al +; 
X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %al
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbq %al, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %sil, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rsi, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rsi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r9, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %al
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %al
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %al, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes:
+; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %cl
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movsbq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %al
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %al
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbq %al, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %sil
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %sil, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rsi, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rsi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r9, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rdi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r10, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %al
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %al, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb (%eax), %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ah, %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbl %ah, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%ebx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%ebx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb (%ecx), %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbl %ch, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%eax), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 28(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 16(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 20(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ebx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbl %bl, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 64(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, 92(%esp,%ecx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 20(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ecx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %bl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %bl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbl %bl, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 64(%esp,%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%eax), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes:
+; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %cl, %dh
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %dh
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %dl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%ebx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%eax), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %al
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %al
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbl %al, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 28(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 24(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, (%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbl %dl, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%edx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%edx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 64(%esp,%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%edx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, 92(%esp,%edx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, (%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 28(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 4(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 16(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 20(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbl %al, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 64(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %ebp, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 28(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 24(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %esi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 4(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 8(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 16(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 20(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, (%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes:
+; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %cl, %dh
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %dh
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $28, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movsbl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 84(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %dl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 80(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%ebx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 88(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 92(%esp,%eax), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $28, %al
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %al
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbl %al, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %edx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 28(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 24(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 16(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, (%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbl %dl, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 84(%esp,%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 80(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%edx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 64(%esp,%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 88(%esp,%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, 92(%esp,%edx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, (%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 28(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 4(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 16(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 20(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %al
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %al
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbl %al, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 64(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %ebp, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %esi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 28(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 24(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %esi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 4(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 8(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 12(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 16(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 20(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, (%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl
  %src = load i256, ptr %src.ptr, align 1
  %byteOff = load i256, ptr %byteOff.ptr, align 1
  %bitOff = shl i256 %byteOff, 3
@@ -7880,617 +6434,472 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 }
 
 define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: shl_32bytes_dwordOff:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rcx
-; FALLBACK0-NEXT: movq 8(%rdi), %r8
-; FALLBACK0-NEXT: movq 16(%rdi), %r9
-; FALLBACK0-NEXT: movq 24(%rdi), %rdi
-; FALLBACK0-NEXT: movzbl (%rsi), %esi
-; FALLBACK0-NEXT: movl %esi, %eax
-; FALLBACK0-NEXT: shlb $5, %al
-; FALLBACK0-NEXT: xorps %xmm0, %xmm0
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: shlb $2, %sil
-; FALLBACK0-NEXT: andb $24, %sil
-; FALLBACK0-NEXT: negb %sil
-; FALLBACK0-NEXT: movsbq %sil, %r10
-; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8
-; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi
-; FALLBACK0-NEXT: movq %rdi, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq %r8, %r9
-; FALLBACK0-NEXT: shrq %r9
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r9
-; FALLBACK0-NEXT: orq %r11, %r9
-; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r11
-; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10
-; FALLBACK0-NEXT: movq %r10, %rbx
-; FALLBACK0-NEXT: shrq %rbx
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: orq %r11, %rbx
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: shrq %rdi
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rdi
-; FALLBACK0-NEXT: orq %r10, %rdi
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: movq %r8, (%rdx)
-; FALLBACK0-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK0-NEXT: movq %rbx, 24(%rdx)
-; FALLBACK0-NEXT: movq %r9, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: shl_32bytes_dwordOff:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: movq (%rdi), %rax
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %rdi
-; FALLBACK1-NEXT: movzbl (%rsi), %esi
-; FALLBACK1-NEXT: movl %esi, %ecx
-; FALLBACK1-NEXT: shlb $5, %cl
-; FALLBACK1-NEXT: xorps %xmm0, %xmm0
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0,
-{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: shlb $2, %sil -; FALLBACK1-NEXT: andb $24, %sil -; FALLBACK1-NEXT: negb %sil -; FALLBACK1-NEXT: movsbq %sil, %rax -; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK1-NEXT: shldq %cl, %rax, %rsi -; FALLBACK1-NEXT: shldq %cl, %r8, %rax -; FALLBACK1-NEXT: shlq %cl, %r8 -; FALLBACK1-NEXT: movq %rsi, 16(%rdx) -; FALLBACK1-NEXT: movq %rdi, 24(%rdx) -; FALLBACK1-NEXT: movq %r8, (%rdx) -; FALLBACK1-NEXT: movq %rax, 8(%rdx) -; FALLBACK1-NEXT: retq -; -; FALLBACK2-LABEL: shl_32bytes_dwordOff: -; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %rdi -; FALLBACK2-NEXT: movzbl (%rsi), %esi -; FALLBACK2-NEXT: movl %esi, %eax -; FALLBACK2-NEXT: shlb $5, %al -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: shlb $2, %sil -; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rsi -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 -; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK2-NEXT: notb %al -; FALLBACK2-NEXT: shrq %rdi -; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, (%rdx) -; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: shl_32bytes_dwordOff: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %rdi -; FALLBACK3-NEXT: movzbl (%rsi), %esi -; FALLBACK3-NEXT: movl %esi, %ecx -; FALLBACK3-NEXT: shlb $5, %cl -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: shlb $2, %sil -; FALLBACK3-NEXT: andb $24, %sil -; FALLBACK3-NEXT: negb %sil -; FALLBACK3-NEXT: movsbq %sil, %rax -; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax -; 
FALLBACK3-NEXT: shldq %cl, %rax, %rsi -; FALLBACK3-NEXT: shldq %cl, %r8, %rax -; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx -; FALLBACK3-NEXT: movq %rsi, 16(%rdx) -; FALLBACK3-NEXT: movq %rdi, 24(%rdx) -; FALLBACK3-NEXT: movq %rcx, (%rdx) -; FALLBACK3-NEXT: movq %rax, 8(%rdx) -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: shl_32bytes_dwordOff: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK4-NEXT: movzbl (%rsi), %ecx -; FALLBACK4-NEXT: movl %ecx, %eax -; FALLBACK4-NEXT: shlb $5, %al -; FALLBACK4-NEXT: xorps %xmm2, %xmm2 -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: shlb $2, %cl -; FALLBACK4-NEXT: andb $24, %cl -; FALLBACK4-NEXT: negb %cl -; FALLBACK4-NEXT: movsbq %cl, %r8 -; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r9 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10 -; FALLBACK4-NEXT: movq %r10, %rdi -; FALLBACK4-NEXT: shrq %rdi -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %rdi -; FALLBACK4-NEXT: orq %r9, %rdi -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9 -; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8 -; FALLBACK4-NEXT: movq %r8, %r11 -; FALLBACK4-NEXT: shrq %r11 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: orq %r10, %r11 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r8 -; FALLBACK4-NEXT: movq %r9, %r10 -; FALLBACK4-NEXT: shrq %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: orq %r8, %r10 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r9 -; FALLBACK4-NEXT: movq %r9, (%rdx) -; FALLBACK4-NEXT: movq %r10, 8(%rdx) -; FALLBACK4-NEXT: movq %r11, 16(%rdx) -; FALLBACK4-NEXT: movq %rdi, 24(%rdx) -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: shl_32bytes_dwordOff: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK5-NEXT: movzbl (%rsi), %eax -; FALLBACK5-NEXT: movl %eax, %ecx -; FALLBACK5-NEXT: shlb $5, %cl -; FALLBACK5-NEXT: xorps %xmm2, %xmm2 -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: shlb $2, %al -; FALLBACK5-NEXT: andb $24, %al -; FALLBACK5-NEXT: negb %al -; FALLBACK5-NEXT: movsbq %al, %rax -; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK5-NEXT: shldq %cl, %rax, %rsi -; FALLBACK5-NEXT: movq %r8, %r9 -; FALLBACK5-NEXT: shlq %cl, %r9 -; FALLBACK5-NEXT: shldq %cl, %r8, %rax -; FALLBACK5-NEXT: movq %rax, 8(%rdx) -; FALLBACK5-NEXT: movq %rsi, 16(%rdx) -; FALLBACK5-NEXT: movq %rdi, 24(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: shl_32bytes_dwordOff: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: movl %ecx, %eax -; FALLBACK6-NEXT: shlb $5, %al -; FALLBACK6-NEXT: 
xorps %xmm2, %xmm2 -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: shlb $2, %cl -; FALLBACK6-NEXT: andb $24, %cl -; FALLBACK6-NEXT: negb %cl -; FALLBACK6-NEXT: movsbq %cl, %rcx -; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: shrq %rcx -; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r8, %rcx -; FALLBACK6-NEXT: shrq %r9 -; FALLBACK6-NEXT: shrxq %rax, %r9, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, (%rdx) -; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, 24(%rdx) -; FALLBACK6-NEXT: retq -; -; FALLBACK7-LABEL: shl_32bytes_dwordOff: -; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: movups (%rdi), %xmm0 -; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK7-NEXT: movzbl (%rsi), %eax -; FALLBACK7-NEXT: movl %eax, %ecx -; FALLBACK7-NEXT: shlb $5, %cl -; FALLBACK7-NEXT: xorps %xmm2, %xmm2 -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: shlb $2, %al -; FALLBACK7-NEXT: andb $24, %al -; FALLBACK7-NEXT: negb %al -; FALLBACK7-NEXT: movsbq %al, %rax -; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK7-NEXT: shldq %cl, %rax, %rsi -; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK7-NEXT: shldq %cl, %r8, %rax -; FALLBACK7-NEXT: movq %rax, 8(%rdx) -; FALLBACK7-NEXT: movq %rsi, 16(%rdx) -; FALLBACK7-NEXT: movq %rdi, 24(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) -; FALLBACK7-NEXT: retq -; -; FALLBACK8-LABEL: shl_32bytes_dwordOff: -; FALLBACK8: # %bb.0: -; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK8-NEXT: movzbl (%rsi), %ecx -; FALLBACK8-NEXT: movl %ecx, %eax -; FALLBACK8-NEXT: shlb $5, %al -; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: shlb $2, %cl -; FALLBACK8-NEXT: andb $24, %cl -; FALLBACK8-NEXT: negb %cl -; FALLBACK8-NEXT: movsbq %cl, %r8 -; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r9 -; FALLBACK8-NEXT: movl %eax, %esi -; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10 -; FALLBACK8-NEXT: movq %r10, %rdi -; FALLBACK8-NEXT: shrq %rdi -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %rdi -; FALLBACK8-NEXT: orq %r9, %rdi -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9 -; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8 -; FALLBACK8-NEXT: movq %r8, %r11 -; FALLBACK8-NEXT: shrq %r11 -; 
FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: orq %r10, %r11 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r8 -; FALLBACK8-NEXT: movq %r9, %r10 -; FALLBACK8-NEXT: shrq %r10 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: orq %r8, %r10 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r9 -; FALLBACK8-NEXT: movq %r9, (%rdx) -; FALLBACK8-NEXT: movq %r10, 8(%rdx) -; FALLBACK8-NEXT: movq %r11, 16(%rdx) -; FALLBACK8-NEXT: movq %rdi, 24(%rdx) -; FALLBACK8-NEXT: vzeroupper -; FALLBACK8-NEXT: retq -; -; FALLBACK9-LABEL: shl_32bytes_dwordOff: -; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK9-NEXT: movzbl (%rsi), %eax -; FALLBACK9-NEXT: movl %eax, %ecx -; FALLBACK9-NEXT: shlb $5, %cl -; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: shlb $2, %al -; FALLBACK9-NEXT: andb $24, %al -; FALLBACK9-NEXT: negb %al -; FALLBACK9-NEXT: movsbq %al, %rax -; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK9-NEXT: shldq %cl, %rax, %rsi -; FALLBACK9-NEXT: movq %r8, %r9 -; FALLBACK9-NEXT: shlq %cl, %r9 -; FALLBACK9-NEXT: shldq %cl, %r8, %rax -; FALLBACK9-NEXT: movq %rax, 8(%rdx) -; FALLBACK9-NEXT: movq %rsi, 16(%rdx) -; FALLBACK9-NEXT: movq %rdi, 24(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) -; FALLBACK9-NEXT: vzeroupper -; FALLBACK9-NEXT: retq -; -; FALLBACK10-LABEL: shl_32bytes_dwordOff: -; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: movl %ecx, %eax -; FALLBACK10-NEXT: shlb $5, %al -; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: shlb $2, %cl -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: negb %cl -; FALLBACK10-NEXT: movsbq %cl, %rcx -; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: shrq %rcx -; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r8, %rcx -; FALLBACK10-NEXT: shrq %r9 -; FALLBACK10-NEXT: shrxq %rax, %r9, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, (%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, 24(%rdx) -; FALLBACK10-NEXT: vzeroupper -; FALLBACK10-NEXT: retq -; -; FALLBACK11-LABEL: shl_32bytes_dwordOff: -; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK11-NEXT: movzbl (%rsi), %eax -; FALLBACK11-NEXT: movl %eax, %ecx -; FALLBACK11-NEXT: shlb $5, %cl -; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: shlb $2, %al -; 
FALLBACK11-NEXT: andb $24, %al -; FALLBACK11-NEXT: negb %al -; FALLBACK11-NEXT: movsbq %al, %rax -; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK11-NEXT: shldq %cl, %rax, %rsi -; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shldq %cl, %r8, %rax -; FALLBACK11-NEXT: movq %rax, 8(%rdx) -; FALLBACK11-NEXT: movq %rsi, 16(%rdx) -; FALLBACK11-NEXT: movq %rdi, 24(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) -; FALLBACK11-NEXT: vzeroupper -; FALLBACK11-NEXT: retq -; -; FALLBACK12-LABEL: shl_32bytes_dwordOff: -; FALLBACK12: # %bb.0: -; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK12-NEXT: movzbl (%rsi), %ecx -; FALLBACK12-NEXT: movl %ecx, %eax -; FALLBACK12-NEXT: shlb $5, %al -; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: shlb $2, %cl -; FALLBACK12-NEXT: andb $24, %cl -; FALLBACK12-NEXT: negb %cl -; FALLBACK12-NEXT: movsbq %cl, %r8 -; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r9 -; FALLBACK12-NEXT: movl %eax, %esi -; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10 -; FALLBACK12-NEXT: movq %r10, %rdi -; FALLBACK12-NEXT: shrq %rdi -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %rdi -; FALLBACK12-NEXT: orq %r9, %rdi -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9 -; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8 -; FALLBACK12-NEXT: movq %r8, %r11 -; FALLBACK12-NEXT: shrq %r11 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: orq %r10, %r11 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r8 -; FALLBACK12-NEXT: movq %r9, %r10 -; FALLBACK12-NEXT: shrq %r10 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: orq %r8, %r10 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, (%rdx) -; FALLBACK12-NEXT: movq %r10, 8(%rdx) -; FALLBACK12-NEXT: movq %r11, 16(%rdx) -; FALLBACK12-NEXT: movq %rdi, 24(%rdx) -; FALLBACK12-NEXT: vzeroupper -; FALLBACK12-NEXT: retq -; -; FALLBACK13-LABEL: shl_32bytes_dwordOff: -; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK13-NEXT: movzbl (%rsi), %eax -; FALLBACK13-NEXT: movl %eax, %ecx -; FALLBACK13-NEXT: shlb $5, %cl -; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: shlb $2, %al -; FALLBACK13-NEXT: andb $24, %al -; FALLBACK13-NEXT: negb %al -; FALLBACK13-NEXT: movsbq %al, %rax -; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK13-NEXT: shldq %cl, %rax, %rsi -; FALLBACK13-NEXT: movq %r8, %r9 -; FALLBACK13-NEXT: shlq %cl, %r9 -; FALLBACK13-NEXT: shldq %cl, %r8, %rax -; FALLBACK13-NEXT: movq %rax, 8(%rdx) -; FALLBACK13-NEXT: movq %rsi, 16(%rdx) -; FALLBACK13-NEXT: movq %rdi, 24(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) -; FALLBACK13-NEXT: vzeroupper -; 
FALLBACK13-NEXT: retq -; -; FALLBACK14-LABEL: shl_32bytes_dwordOff: -; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: movl %ecx, %eax -; FALLBACK14-NEXT: shlb $5, %al -; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: shlb $2, %cl -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: negb %cl -; FALLBACK14-NEXT: movsbq %cl, %rcx -; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: shrq %rcx -; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r8, %rcx -; FALLBACK14-NEXT: shrq %r9 -; FALLBACK14-NEXT: shrxq %rax, %r9, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, (%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, 24(%rdx) -; FALLBACK14-NEXT: vzeroupper -; FALLBACK14-NEXT: retq -; -; FALLBACK15-LABEL: shl_32bytes_dwordOff: -; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK15-NEXT: movzbl (%rsi), %eax -; FALLBACK15-NEXT: movl %eax, %ecx -; FALLBACK15-NEXT: shlb $5, %cl -; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: shlb $2, %al -; FALLBACK15-NEXT: andb $24, %al -; FALLBACK15-NEXT: negb %al -; FALLBACK15-NEXT: movsbq %al, %rax -; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi -; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi -; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi -; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8 -; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax -; FALLBACK15-NEXT: shldq %cl, %rax, %rsi -; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9 -; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shldq %cl, %r8, %rax -; FALLBACK15-NEXT: movq %rax, 8(%rdx) -; FALLBACK15-NEXT: movq %rsi, 16(%rdx) -; FALLBACK15-NEXT: movq %rdi, 24(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) -; FALLBACK15-NEXT: vzeroupper -; FALLBACK15-NEXT: retq +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; 
X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $2, %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%r10), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%r10), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%r10), %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%r10), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $2, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; 
X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $2, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rdi), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rsi, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rsi, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rcx, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; 
X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $2, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $2, %cl +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %cl +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 
24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $2, %al +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %al +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %al +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbq %al, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %al +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $2, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %sil +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %sil, %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rsi, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rsi, %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r9, %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq 
%rax, %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $2, %al +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %al +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %al +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %al, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $2, %cl +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %cl +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movsbq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%r8), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%r8), %r8 +; 
X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $2, %al +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %al +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %al +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbq %al, %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %al +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $2, %sil +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %sil +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %sil, %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %r8, %r8 +; 
X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rsi, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rsi, %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r9, %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rdi, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r10, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $2, %al +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %al +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %al, %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq ; ; X86-SSE2-LABEL: shl_32bytes_dwordOff: ; X86-SSE2: # %bb.0: @@ -8800,2193 +7209,1656 @@ define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou } define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; FALLBACK0-LABEL: ashr_32bytes: -; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 -; FALLBACK0-NEXT: movq 24(%rdi), %rdi -; FALLBACK0-NEXT: movzbl (%rsi), %esi -; FALLBACK0-NEXT: leal (,%rsi,8), %eax -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: sarq $63, %rdi -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: andb $24, %sil -; FALLBACK0-NEXT: movzbl %sil, %r9d -; 
FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi
-; FALLBACK0-NEXT: movq %rdi, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx
-; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: orq %r11, %r8
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r10
-; FALLBACK0-NEXT: addq %rdi, %rdi
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %rdi
-; FALLBACK0-NEXT: orq %r10, %rdi
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: orq %rbx, %r10
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: sarq %cl, %r9
-; FALLBACK0-NEXT: movq %r9, 24(%rdx)
-; FALLBACK0-NEXT: movq %r10, 16(%rdx)
-; FALLBACK0-NEXT: movq %rdi, (%rdx)
-; FALLBACK0-NEXT: movq %r8, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: ashr_32bytes:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: movq (%rdi), %rax
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %rdi
-; FALLBACK1-NEXT: movzbl (%rsi), %esi
-; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: sarq $63, %rdi
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: andb $24, %sil
-; FALLBACK1-NEXT: movzbl %sil, %eax
-; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi
-; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi
-; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8
-; FALLBACK1-NEXT: movq %r8, %r9
-; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax
-; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK1-NEXT: sarq %cl, %rax
-; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK1-NEXT: movq %rax, 24(%rdx)
-; FALLBACK1-NEXT: movq %rdi, (%rdx)
-; FALLBACK1-NEXT: movq %r9, 8(%rdx)
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: ashr_32bytes:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: movq (%rdi), %rcx
-; FALLBACK2-NEXT: movq 8(%rdi), %r8
-; FALLBACK2-NEXT: movq 16(%rdi), %r9
-; FALLBACK2-NEXT: movq 24(%rdi), %rdi
-; FALLBACK2-NEXT: movzbl (%rsi), %esi
-; FALLBACK2-NEXT: leal (,%rsi,8), %eax
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: sarq $63, %rdi
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: andb $24, %sil
-; FALLBACK2-NEXT: movzbl %sil, %ecx
-; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi
-; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi
-; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
-; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9
-; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK2-NEXT: notb %al
-; FALLBACK2-NEXT: addq %rdi, %rdi
-; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT: orq %r8, %rdi
-; FALLBACK2-NEXT: addq %rsi, %rsi
-; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r9, %rsi
-; FALLBACK2-NEXT: addq %rcx, %rcx
-; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
-; FALLBACK2-NEXT: orq %r10, %rax
-; FALLBACK2-NEXT: movq %r11, 24(%rdx)
-; FALLBACK2-NEXT: movq %rax, 16(%rdx)
-; FALLBACK2-NEXT: movq %rsi, (%rdx)
-; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
-; FALLBACK2-NEXT: retq
-;
-; FALLBACK3-LABEL: ashr_32bytes:
-; FALLBACK3: # %bb.0:
-; FALLBACK3-NEXT: movq (%rdi), %rax
-; FALLBACK3-NEXT: movq 8(%rdi), %r8
-; FALLBACK3-NEXT: movq 16(%rdi), %r9
-; FALLBACK3-NEXT: movq 24(%rdi), %rdi
-; FALLBACK3-NEXT: movzbl (%rsi), %esi
-; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: sarq $63, %rdi
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: andb $24, %sil
-; FALLBACK3-NEXT: movzbl %sil, %eax
-; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi
-; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi
-; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8
-; FALLBACK3-NEXT: movq %r8, %r9
-; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax
-; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax
-; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK3-NEXT: movq %rax, 24(%rdx)
-; FALLBACK3-NEXT: movq %rdi, (%rdx)
-; FALLBACK3-NEXT: movq %r9, 8(%rdx)
-; FALLBACK3-NEXT: retq
-;
-; FALLBACK4-LABEL: ashr_32bytes:
-; FALLBACK4: # %bb.0:
-; FALLBACK4-NEXT: pushq %rbx
-; FALLBACK4-NEXT: movups (%rdi), %xmm0
-; FALLBACK4-NEXT: movq 16(%rdi), %rcx
-; FALLBACK4-NEXT: movq 24(%rdi), %rdi
-; FALLBACK4-NEXT: movzbl (%rsi), %esi
-; FALLBACK4-NEXT: leal (,%rsi,8), %eax
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: sarq $63, %rdi
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: andb $24, %sil
-; FALLBACK4-NEXT: movzbl %sil, %r9d
-; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r10
-; FALLBACK4-NEXT: movl %eax, %esi
-; FALLBACK4-NEXT: notb %sil
-; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rdi
-; FALLBACK4-NEXT: orq %r10, %rdi
-; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10
-; FALLBACK4-NEXT: movq %r10, %r11
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r11
-; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rbx
-; FALLBACK4-NEXT: orq %r11, %rbx
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r8
-; FALLBACK4-NEXT: addq %r10, %r10
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r10
-; FALLBACK4-NEXT: orq %r8, %r10
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: sarq %cl, %r9
-; FALLBACK4-NEXT: movq %r9, 24(%rdx)
-; FALLBACK4-NEXT: movq %r10, 8(%rdx)
-; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK4-NEXT: movq %rdi, (%rdx)
-; FALLBACK4-NEXT: popq %rbx
-; FALLBACK4-NEXT: retq
-;
-; FALLBACK5-LABEL: ashr_32bytes:
-; FALLBACK5: # %bb.0:
-; FALLBACK5-NEXT: movups (%rdi), %xmm0
-; FALLBACK5-NEXT: movq 16(%rdi), %rax
-; FALLBACK5-NEXT: movq 24(%rdi), %rdi
-; FALLBACK5-NEXT: movzbl (%rsi), %esi
-; FALLBACK5-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: sarq $63, %rdi
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: andb $24, %sil
-; FALLBACK5-NEXT: movzbl %sil, %eax
-; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK5-NEXT: movq %rdi, %r8
-; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK5-NEXT: movq %rax, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK5-NEXT: sarq %cl, %rsi
-; FALLBACK5-NEXT: movq %r10, 8(%rdx)
-; FALLBACK5-NEXT: movq %r8, 16(%rdx)
-; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK5-NEXT: movq %r9, (%rdx)
-; FALLBACK5-NEXT: retq
-;
-; FALLBACK6-LABEL: ashr_32bytes:
-; FALLBACK6: # %bb.0:
-; FALLBACK6-NEXT: movups (%rdi), %xmm0
-; FALLBACK6-NEXT: movq 16(%rdi), %rcx
-; FALLBACK6-NEXT: movq 24(%rdi), %rdi
-; FALLBACK6-NEXT: movzbl (%rsi), %esi
-; FALLBACK6-NEXT: leal (,%rsi,8), %eax
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: sarq $63, %rdi
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: andb $24, %sil
-; FALLBACK6-NEXT: movzbl %sil, %ecx
-; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi
-; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8
-; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK6-NEXT: notb %al
-; FALLBACK6-NEXT: addq %rdi, %rdi
-; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT: orq %rsi, %rdi
-; FALLBACK6-NEXT: addq %rcx, %rcx
-; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT: orq %r9, %rcx
-; FALLBACK6-NEXT: addq %r8, %r8
-; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK6-NEXT: orq %r10, %rax
-; FALLBACK6-NEXT: movq %r11, 24(%rdx)
-; FALLBACK6-NEXT: movq %rax, 8(%rdx)
-; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT: movq %rdi, (%rdx)
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: ashr_32bytes:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movq 16(%rdi), %rax
-; FALLBACK7-NEXT: movq 24(%rdi), %rdi
-; FALLBACK7-NEXT: movzbl (%rsi), %esi
-; FALLBACK7-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: sarq $63, %rdi
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: andb $24, %sil
-; FALLBACK7-NEXT: movzbl %sil, %eax
-; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK7-NEXT: movq %rdi, %r8
-; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK7-NEXT: movq %rax, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax
-; FALLBACK7-NEXT: movq %r10, 8(%rdx)
-; FALLBACK7-NEXT: movq %r8, 16(%rdx)
-; FALLBACK7-NEXT: movq %rax, 24(%rdx)
-; FALLBACK7-NEXT: movq %r9, (%rdx)
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: ashr_32bytes:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: pushq %rbx
-; FALLBACK8-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK8-NEXT: movq 16(%rdi), %rcx
-; FALLBACK8-NEXT: movq 24(%rdi), %rdi
-; FALLBACK8-NEXT: movzbl (%rsi), %esi
-; FALLBACK8-NEXT: leal (,%rsi,8), %eax
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: sarq $63, %rdi
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: andb $24, %sil
-; FALLBACK8-NEXT: movzbl %sil, %r9d
-; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rdi
-; FALLBACK8-NEXT: orq %r10, %rdi
-; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10
-; FALLBACK8-NEXT: movq %r10, %r11
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r11
-; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rbx
-; FALLBACK8-NEXT: orq %r11, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r8
-; FALLBACK8-NEXT: addq %r10, %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: orq %r8, %r10
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: sarq %cl, %r9
-; FALLBACK8-NEXT: movq %r9, 24(%rdx)
-; FALLBACK8-NEXT: movq %r10, 8(%rdx)
-; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK8-NEXT: movq %rdi, (%rdx)
-; FALLBACK8-NEXT: popq %rbx
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: ashr_32bytes:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK9-NEXT: movq 16(%rdi), %rax
-; FALLBACK9-NEXT: movq 24(%rdi), %rdi
-; FALLBACK9-NEXT: movzbl (%rsi), %esi
-; FALLBACK9-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: sarq $63, %rdi
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: andb $24, %sil
-; FALLBACK9-NEXT: movzbl %sil, %eax
-; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK9-NEXT: movq %rdi, %r8
-; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK9-NEXT: movq %rax, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK9-NEXT: sarq %cl, %rsi
-; FALLBACK9-NEXT: movq %r10, 8(%rdx)
-; FALLBACK9-NEXT: movq %r8, 16(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK9-NEXT: movq %r9, (%rdx)
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: ashr_32bytes:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK10-NEXT: movq 16(%rdi), %rcx
-; FALLBACK10-NEXT: movq 24(%rdi), %rdi
-; FALLBACK10-NEXT: movzbl (%rsi), %esi
-; FALLBACK10-NEXT: leal (,%rsi,8), %eax
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: sarq $63, %rdi
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: andb $24, %sil
-; FALLBACK10-NEXT: movzbl %sil, %ecx
-; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi
-; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8
-; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK10-NEXT: notb %al
-; FALLBACK10-NEXT: addq %rdi, %rdi
-; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT: orq %rsi, %rdi
-; FALLBACK10-NEXT: addq %rcx, %rcx
-; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT: orq %r9, %rcx
-; FALLBACK10-NEXT: addq %r8, %r8
-; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK10-NEXT: orq %r10, %rax
-; FALLBACK10-NEXT: movq %r11, 24(%rdx)
-; FALLBACK10-NEXT: movq %rax, 8(%rdx)
-; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT: movq %rdi, (%rdx)
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: ashr_32bytes:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK11-NEXT: movq 16(%rdi), %rax
-; FALLBACK11-NEXT: movq 24(%rdi), %rdi
-; FALLBACK11-NEXT: movzbl (%rsi), %esi
-; FALLBACK11-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: sarq $63, %rdi
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: andb $24, %sil
-; FALLBACK11-NEXT: movzbl %sil, %eax
-; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK11-NEXT: movq %rdi, %r8
-; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK11-NEXT: movq %rax, %r10
-; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax
-; FALLBACK11-NEXT: movq %r10, 8(%rdx)
-; FALLBACK11-NEXT: movq %r8, 16(%rdx)
-; FALLBACK11-NEXT: movq %rax, 24(%rdx)
-; FALLBACK11-NEXT: movq %r9, (%rdx)
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: ashr_32bytes:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: pushq %rbx
-; FALLBACK12-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK12-NEXT: movq 16(%rdi), %rcx
-; FALLBACK12-NEXT: movq 24(%rdi), %rdi
-; FALLBACK12-NEXT: movzbl (%rsi), %esi
-; FALLBACK12-NEXT: leal (,%rsi,8), %eax
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: sarq $63, %rdi
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: andb $24, %sil
-; FALLBACK12-NEXT: movzbl %sil, %r9d
-; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rdi
-; FALLBACK12-NEXT: orq %r10, %rdi
-; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10
-; FALLBACK12-NEXT: movq %r10, %r11
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r11
-; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rbx
-; FALLBACK12-NEXT: orq %r11, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r8
-; FALLBACK12-NEXT: addq %r10, %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: orq %r8, %r10
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: sarq %cl, %r9
-; FALLBACK12-NEXT: movq %r9, 24(%rdx)
-; FALLBACK12-NEXT: movq %r10, 8(%rdx)
-; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK12-NEXT: movq %rdi, (%rdx)
-; FALLBACK12-NEXT: popq %rbx
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: ashr_32bytes:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK13-NEXT: movq 16(%rdi), %rax
-; FALLBACK13-NEXT: movq 24(%rdi), %rdi
-; FALLBACK13-NEXT: movzbl (%rsi), %esi
-; FALLBACK13-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: sarq $63, %rdi
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: andb $24, %sil
-; FALLBACK13-NEXT: movzbl %sil, %eax
-; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK13-NEXT: movq %rdi, %r8
-; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK13-NEXT: movq %rax, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK13-NEXT: sarq %cl, %rsi
-; FALLBACK13-NEXT: movq %r10, 8(%rdx)
-; FALLBACK13-NEXT: movq %r8, 16(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK13-NEXT: movq %r9, (%rdx)
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: ashr_32bytes:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK14-NEXT: movq 16(%rdi), %rcx
-; FALLBACK14-NEXT: movq 24(%rdi), %rdi
-; FALLBACK14-NEXT: movzbl (%rsi), %esi
-; FALLBACK14-NEXT: leal (,%rsi,8), %eax
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: sarq $63, %rdi
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: andb $24, %sil
-; FALLBACK14-NEXT: movzbl %sil, %ecx
-; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi
-; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8
-; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK14-NEXT: notb %al
-; FALLBACK14-NEXT: addq %rdi, %rdi
-; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT: orq %rsi, %rdi
-; FALLBACK14-NEXT: addq %rcx, %rcx
-; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT: orq %r9, %rcx
-; FALLBACK14-NEXT: addq %r8, %r8
-; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK14-NEXT: orq %r10, %rax
-; FALLBACK14-NEXT: movq %r11, 24(%rdx)
-; FALLBACK14-NEXT: movq %rax, 8(%rdx)
-; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT: movq %rdi, (%rdx)
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: ashr_32bytes:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK15-NEXT: movq 16(%rdi), %rax
-; FALLBACK15-NEXT: movq 24(%rdi), %rdi
-; FALLBACK15-NEXT: movzbl (%rsi), %esi
-; FALLBACK15-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: sarq $63, %rdi
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: andb $24, %sil
-; FALLBACK15-NEXT: movzbl %sil, %eax
-; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK15-NEXT: movq %rdi, %r8
-; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK15-NEXT: movq %rax, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax
-; FALLBACK15-NEXT: movq %r10, 8(%rdx)
-; FALLBACK15-NEXT: movq %r8, 16(%rdx)
-; FALLBACK15-NEXT: movq %rax, 24(%rdx)
-; FALLBACK15-NEXT: movq %r9, (%rdx)
-; FALLBACK15-NEXT: retq
-;
-; FALLBACK16-LABEL: ashr_32bytes:
-; FALLBACK16: # %bb.0:
-; FALLBACK16-NEXT: pushl %ebp
-; FALLBACK16-NEXT: pushl %ebx
-; FALLBACK16-NEXT: pushl %edi
-; FALLBACK16-NEXT: pushl %esi
-; FALLBACK16-NEXT: subl $108, %esp
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK16-NEXT: movl (%esi), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 4(%esi), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 8(%esi), %ebx
-; FALLBACK16-NEXT: movl 12(%esi), %ebp
-; FALLBACK16-NEXT: movl 16(%esi), %edi
-; FALLBACK16-NEXT: movzbl (%eax), %ecx
-; FALLBACK16-NEXT: movl 20(%esi), %edx
-; FALLBACK16-NEXT: movl 24(%esi), %eax
-; FALLBACK16-NEXT: movl 28(%esi), %esi
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, %edx
-; FALLBACK16-NEXT: shlb $3, %dl
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: sarl $31, %esi
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: andb $28, %cl
-; FALLBACK16-NEXT: movzbl %cl, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi
-; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax
-; FALLBACK16-NEXT: movl %eax, %ebx
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: movb %dl, %ch
-; FALLBACK16-NEXT: notb %ch
-; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi
-; FALLBACK16-NEXT: leal (%edi,%edi), %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %ebx, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: addl %eax, %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: orl %esi, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp
-; FALLBACK16-NEXT: movl %ebp, %esi
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: movl %edx, %ebx
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: movl 48(%esp,%eax), %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: leal (%edx,%edx), %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: orl %esi, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %ebx, %edx
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: addl %ebp, %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %edi, %ebp
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK16-NEXT: movl 52(%esp,%esi), %edi
-; FALLBACK16-NEXT: movl %edi, %eax
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl 56(%esp,%esi), %ebx
-; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: orl %eax, %esi
-; FALLBACK16-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: addl %edi, %edi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: orl %eax, %edi
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl 60(%esp,%eax), %eax
-; FALLBACK16-NEXT: leal (%eax,%eax), %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: orl %ebx, %edx
-; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; FALLBACK16-NEXT: sarl %cl, %eax
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT: movl %eax, 28(%ecx)
-; FALLBACK16-NEXT: movl %edx, 24(%ecx)
-; FALLBACK16-NEXT: movl %edi, 16(%ecx)
-; FALLBACK16-NEXT: movl %esi, 20(%ecx)
-; FALLBACK16-NEXT: movl %ebp, 8(%ecx)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, 12(%ecx)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, (%ecx)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, 4(%ecx)
-; FALLBACK16-NEXT: addl $108, %esp
-; FALLBACK16-NEXT: popl %esi
-; FALLBACK16-NEXT: popl %edi
-; FALLBACK16-NEXT: popl %ebx
-; FALLBACK16-NEXT: popl %ebp
-; FALLBACK16-NEXT: retl
-;
-; FALLBACK17-LABEL: ashr_32bytes:
-; FALLBACK17: # %bb.0:
-; FALLBACK17-NEXT: pushl %ebp
-; FALLBACK17-NEXT: pushl %ebx
-; FALLBACK17-NEXT: pushl %edi
-; FALLBACK17-NEXT: pushl %esi
-; FALLBACK17-NEXT: subl $92, %esp
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl (%ecx), %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 4(%ecx), %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 8(%ecx), %edx
-; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: movl 12(%ecx), %ebp
-; FALLBACK17-NEXT: movl 16(%ecx), %ebx
-; FALLBACK17-NEXT: movzbl (%eax), %eax
-; FALLBACK17-NEXT: movl 20(%ecx), %edi
-; FALLBACK17-NEXT: movl 24(%ecx), %edx
-; FALLBACK17-NEXT: movl 28(%ecx), %esi
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, %ecx
-; FALLBACK17-NEXT: shlb $3, %cl
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: sarl $31, %esi
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: andb $28, %al
-; FALLBACK17-NEXT: movzbl %al, %ebp
-; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx
-; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx
-; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %esi
-; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi
-; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx
-; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edi
-; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi
-; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK17-NEXT: movl %edx, 24(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: shrdl %cl, %edx, %esi
-; FALLBACK17-NEXT: sarl %cl, %eax
-; FALLBACK17-NEXT: movl %eax, 28(%ebp)
-; FALLBACK17-NEXT: movl %ebx, 16(%ebp)
-; FALLBACK17-NEXT: movl %edi, 20(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 8(%ebp)
-; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 12(%ebp)
-; FALLBACK17-NEXT: movl %esi, (%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 4(%ebp)
-; FALLBACK17-NEXT: addl $92, %esp
-; FALLBACK17-NEXT: popl %esi
-; FALLBACK17-NEXT: popl %edi
-; FALLBACK17-NEXT: popl %ebx
-; FALLBACK17-NEXT: popl %ebp
-; FALLBACK17-NEXT: retl
-;
-; FALLBACK18-LABEL: ashr_32bytes:
-; FALLBACK18: # %bb.0:
-; FALLBACK18-NEXT: pushl %ebp
-; FALLBACK18-NEXT: pushl %ebx
-; FALLBACK18-NEXT: pushl %edi
-; FALLBACK18-NEXT: pushl %esi
-; FALLBACK18-NEXT: subl $108, %esp
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK18-NEXT: movl (%esi), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 4(%esi), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 8(%esi), %ebx
-; FALLBACK18-NEXT: movl 12(%esi), %ebp
-; FALLBACK18-NEXT: movl 16(%esi), %edi
-; FALLBACK18-NEXT: movzbl (%ecx), %ecx
-; FALLBACK18-NEXT: movl 20(%esi), %edx
-; FALLBACK18-NEXT: movl 24(%esi), %eax
-; FALLBACK18-NEXT: movl 28(%esi), %esi
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, %eax
-; FALLBACK18-NEXT: shlb $3, %al
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: sarl $31, %esi
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: andb $28, %cl
-; FALLBACK18-NEXT: movzbl %cl, %edi
-; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx
-; FALLBACK18-NEXT: shrxl %eax, %esi, %ebx
-; FALLBACK18-NEXT: movl %eax, %edx
-; FALLBACK18-NEXT: notb %dl
-; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp
-; FALLBACK18-NEXT: shlxl %edx, %ebp, %ebp
-; FALLBACK18-NEXT: orl %ebx, %ebp
-; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%edi), %ebx
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %esi
-; FALLBACK18-NEXT: orl %ebx, %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 48(%esp,%edi), %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%esi,%esi), %ebx
-; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi
-; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp
-; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx
-; FALLBACK18-NEXT: orl %ebx, %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %eax, %ecx, %ecx
-; FALLBACK18-NEXT: movl %eax, %ebx
-; FALLBACK18-NEXT: addl %ebp, %ebp
-; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax
-; FALLBACK18-NEXT: orl %ecx, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp
-; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
-; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax
-; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi
-; FALLBACK18-NEXT: orl %esi, %ecx
-; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: addl %eax, %eax
-; FALLBACK18-NEXT: shlxl %edx, %eax, %esi
-; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax
-; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi
-; FALLBACK18-NEXT: sarxl %ebx, %edi, %ebx
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %edx, %edi, %edx
-; FALLBACK18-NEXT: orl %eax, %edx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl %ebx, 28(%eax)
-; FALLBACK18-NEXT: movl %edx, 24(%eax)
-; FALLBACK18-NEXT: movl %esi, 16(%eax)
-; FALLBACK18-NEXT: movl %ecx, 20(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 8(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 12(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, (%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 4(%eax)
-; FALLBACK18-NEXT: addl $108, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: ashr_32bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $92, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl (%ecx), %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 4(%ecx), %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 8(%ecx), %edx
-; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 12(%ecx), %ebp
-; FALLBACK19-NEXT: movl 16(%ecx), %ebx
-; FALLBACK19-NEXT: movzbl (%eax), %eax
-; FALLBACK19-NEXT: movl 20(%ecx), %edi
-; FALLBACK19-NEXT: movl 24(%ecx), %edx
-; FALLBACK19-NEXT: movl 28(%ecx), %esi
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, %ecx
-; FALLBACK19-NEXT: shlb $3, %cl
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: sarl $31, %esi
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: andb $28, %al
-; FALLBACK19-NEXT: movzbl %al, %ebp
-; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi
-; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %esi, %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx
-; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx
-; FALLBACK19-NEXT: movl %edx, %esi
-; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx
-; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi
-; FALLBACK19-NEXT: shrdl %cl, %edi, %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl %eax, 24(%ebp)
-; FALLBACK19-NEXT: sarxl %ecx, %edi, %eax
-; FALLBACK19-NEXT: movl %eax, 28(%ebp)
-; FALLBACK19-NEXT: movl %ebx, 16(%ebp)
-; FALLBACK19-NEXT: movl %esi, 20(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 8(%ebp)
-; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 12(%ebp)
-; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK19-NEXT: movl %edx, (%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 4(%ebp)
-; FALLBACK19-NEXT: addl $92, %esp
-; FALLBACK19-NEXT: popl %esi
-; FALLBACK19-NEXT: popl %edi
-; FALLBACK19-NEXT: popl %ebx
-; FALLBACK19-NEXT: popl %ebp
-; FALLBACK19-NEXT: retl
-;
-; FALLBACK20-LABEL: ashr_32bytes:
-; FALLBACK20: # %bb.0:
-; FALLBACK20-NEXT: pushl %ebp
-; FALLBACK20-NEXT: pushl %ebx
-; FALLBACK20-NEXT: pushl %edi
-; FALLBACK20-NEXT: pushl %esi
-; FALLBACK20-NEXT: subl $108, %esp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movups (%ecx), %xmm0
-; FALLBACK20-NEXT: movl 16(%ecx), %esi
-; FALLBACK20-NEXT: movl 20(%ecx), %edi
-; FALLBACK20-NEXT: movl 24(%ecx), %ebx
-; FALLBACK20-NEXT: movl 28(%ecx), %edx
-; FALLBACK20-NEXT: movzbl (%eax), %eax
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shlb $3, %cl
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: sarl $31, %edx
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: andb $28, %al
-; FALLBACK20-NEXT: movzbl %al, %edi
-; FALLBACK20-NEXT: movl 32(%esp,%edi), %eax
-; FALLBACK20-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: movl %ecx, %edx
-; FALLBACK20-NEXT: movb %cl, %dh
-; FALLBACK20-NEXT: notb %dl
-; FALLBACK20-NEXT: addl %esi, %esi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: orl %eax, %esi
-; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %eax
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: movl 48(%esp,%edi), %esi
-; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: addl %esi, %esi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: orl %eax, %esi
-; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi
-; FALLBACK20-NEXT: movl %esi, %eax
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %eax, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp
-; FALLBACK20-NEXT: movl %ebp, %eax
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %eax, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: addl %ebp, %ebp
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebp
-; FALLBACK20-NEXT: orl %eax, %ebp
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: movl 60(%esp,%edi), %eax
-; FALLBACK20-NEXT: leal (%eax,%eax), %edi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %edi
-; FALLBACK20-NEXT: orl %ebx, %edi
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: addl %esi, %esi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: orl %ebx, %esi
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: sarl %cl, %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movl %eax, 28(%ecx)
-; FALLBACK20-NEXT: movl %esi, 4(%ecx)
-; FALLBACK20-NEXT: movl %edi, 24(%ecx)
-; FALLBACK20-NEXT: movl %ebp, 16(%ecx)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: movl %eax, 20(%ecx)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: movl %eax, 8(%ecx)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: movl %eax, 12(%ecx)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: movl %eax, (%ecx)
-; FALLBACK20-NEXT: addl $108, %esp
-; FALLBACK20-NEXT: popl %esi
-; FALLBACK20-NEXT: popl %edi
-; FALLBACK20-NEXT: popl %ebx
-; FALLBACK20-NEXT: popl %ebp
-; FALLBACK20-NEXT: retl
-;
-; FALLBACK21-LABEL: ashr_32bytes:
-; FALLBACK21: # %bb.0:
-; FALLBACK21-NEXT: pushl %ebp
-; FALLBACK21-NEXT: pushl %ebx
-; FALLBACK21-NEXT: pushl %edi
-; FALLBACK21-NEXT: pushl %esi
-; FALLBACK21-NEXT: subl $108, %esp
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK21-NEXT: movups (%ecx), %xmm0
-; FALLBACK21-NEXT: movl 16(%ecx), %esi
-; FALLBACK21-NEXT: movl 20(%ecx), %edi
-; FALLBACK21-NEXT: movl 24(%ecx), %ebx
-; FALLBACK21-NEXT: movl 28(%ecx), %edx
-; FALLBACK21-NEXT: movzbl (%eax), %eax
-; FALLBACK21-NEXT: movl %eax, %ecx
-; FALLBACK21-NEXT: shlb $3, %cl
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: sarl $31, %edx
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: andb $28, %al
-; FALLBACK21-NEXT: movzbl %al, %ebp
-; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi
-; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx
-; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi
-; FALLBACK21-NEXT: movl %edi, %esi
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK21-NEXT: movl %esi, 4(%ebp)
-; FALLBACK21-NEXT: movl %ebx, 24(%ebp)
-; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK21-NEXT: sarl %cl, %eax
-; FALLBACK21-NEXT: movl %eax, 28(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 16(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 20(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 8(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 12(%ebp)
-; FALLBACK21-NEXT: movl %edx, (%ebp)
-; FALLBACK21-NEXT: addl $108, %esp
-; FALLBACK21-NEXT: popl %esi
-; FALLBACK21-NEXT: popl %edi
-; FALLBACK21-NEXT: popl %ebx
-; FALLBACK21-NEXT: popl %ebp
-; FALLBACK21-NEXT: retl
-;
-; FALLBACK22-LABEL: ashr_32bytes:
-; FALLBACK22: # %bb.0:
-; FALLBACK22-NEXT: pushl %ebp
-; FALLBACK22-NEXT: pushl %ebx
-; FALLBACK22-NEXT: pushl %edi
-; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $108, %esp
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movl 16(%ecx), %esi
-; FALLBACK22-NEXT: movl 20(%ecx), %edi
-; FALLBACK22-NEXT: movl 24(%ecx), %ebx
-; FALLBACK22-NEXT: movl 28(%ecx), %edx
-; FALLBACK22-NEXT: movzbl (%eax), %ecx
-; FALLBACK22-NEXT: movl %ecx, %eax
-; FALLBACK22-NEXT: shlb $3, %al
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: sarl $31, %edx
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: andb $28, %cl
-; FALLBACK22-NEXT: movzbl %cl, %edi
-; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
-; FALLBACK22-NEXT: movl %eax, %edx
-; FALLBACK22-NEXT: notb %dl
-; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: addl %esi, %esi
-; FALLBACK22-NEXT: shlxl %edx, %esi, %esi
-; FALLBACK22-NEXT: orl %ecx, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: addl %ecx, %ecx
-; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi
-; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx
-; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx
-; FALLBACK22-NEXT: orl %ebx, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: addl %ecx, %ecx
-; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi
-; FALLBACK22-NEXT: movl 40(%esp,%edi), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx
-; FALLBACK22-NEXT: movl %eax, %ecx
-; FALLBACK22-NEXT: orl %ebx, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi
-; FALLBACK22-NEXT: leal (%esi,%esi), %ebx
-; FALLBACK22-NEXT: shlxl %edx, %ebx, %eax
-; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx
-; FALLBACK22-NEXT: shrxl %ecx, %ebx, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl %ecx, %eax
-; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK22-NEXT: addl %ebx, %ebx
-; FALLBACK22-NEXT: shlxl %edx, %ebx, %ebx
-; FALLBACK22-NEXT: orl %ebp, %ebx
-; FALLBACK22-NEXT: shrxl %ecx, %esi, %ecx
-; FALLBACK22-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi
-; FALLBACK22-NEXT: sarxl %eax, %edi, %eax
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %edx, %edi, %edi
-; FALLBACK22-NEXT: orl %ecx, %edi
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: addl %ecx, %ecx
-; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK22-NEXT: orl %esi, %ecx
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK22-NEXT: movl %eax, 28(%edx)
-; FALLBACK22-NEXT: movl %ecx, 4(%edx)
-; FALLBACK22-NEXT: movl %edi, 24(%edx)
-; FALLBACK22-NEXT: movl %ebx, 16(%edx)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, 20(%edx)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, 8(%edx)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, 12(%edx)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, (%edx)
-; FALLBACK22-NEXT: addl $108, %esp
-; FALLBACK22-NEXT: popl %esi
-; FALLBACK22-NEXT: popl %edi
-; FALLBACK22-NEXT: popl %ebx
-; FALLBACK22-NEXT: popl %ebp
-; FALLBACK22-NEXT: retl
-;
-; FALLBACK23-LABEL: ashr_32bytes:
-; FALLBACK23: # %bb.0:
-; FALLBACK23-NEXT: pushl %ebp
-; FALLBACK23-NEXT: pushl %ebx
-; FALLBACK23-NEXT: pushl %edi
-; FALLBACK23-NEXT: pushl %esi
-; FALLBACK23-NEXT: subl $108, %esp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK23-NEXT: movups (%ecx), %xmm0
-; FALLBACK23-NEXT: movl 16(%ecx), %esi
-; FALLBACK23-NEXT: movl 20(%ecx), %edi
-; FALLBACK23-NEXT: movl 24(%ecx), %ebx
-; FALLBACK23-NEXT: movl 28(%ecx), %edx
-; FALLBACK23-NEXT: movzbl (%eax), %eax
-; FALLBACK23-NEXT: movl %eax, %ecx
-; FALLBACK23-NEXT: shlb $3, %cl
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: sarl $31, %edx
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: andb $28, %al
-; FALLBACK23-NEXT: movzbl %al, %ebx
-; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi
-; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp
-; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax
-; FALLBACK23-NEXT: movl %eax, %edi
-; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi
-; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax
-; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp
-; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx
-; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx
-; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl %ebx, 4(%eax)
-; FALLBACK23-NEXT: movl %ebp, 24(%eax)
-; FALLBACK23-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; FALLBACK23-NEXT: movl %ebx, 28(%eax)
-; FALLBACK23-NEXT: movl %esi, 16(%eax)
-; FALLBACK23-NEXT: movl %edi, 20(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK23-NEXT: movl %esi, 8(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK23-NEXT: movl %esi, 12(%eax)
-; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl %edx, (%eax)
-; FALLBACK23-NEXT: addl $108, %esp
-; FALLBACK23-NEXT: popl %esi
-; FALLBACK23-NEXT: popl %edi
-; FALLBACK23-NEXT: popl %ebx
-; FALLBACK23-NEXT: popl %ebp
-; FALLBACK23-NEXT: retl
-;
-; FALLBACK24-LABEL: ashr_32bytes:
-; FALLBACK24: # %bb.0:
-; FALLBACK24-NEXT: pushl %ebp
-; FALLBACK24-NEXT: pushl %ebx
-; FALLBACK24-NEXT: pushl %edi
-; FALLBACK24-NEXT: pushl %esi
-; FALLBACK24-NEXT: subl $108, %esp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK24-NEXT: movl 16(%ecx), %esi
-; FALLBACK24-NEXT: movl 20(%ecx), %edi
-; FALLBACK24-NEXT: movl 24(%ecx), %ebx
-; FALLBACK24-NEXT: movl 28(%ecx), %edx
-; FALLBACK24-NEXT: movzbl (%eax), %eax
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shlb $3, %cl
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: sarl $31, %edx
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: andb $28, %al
-; FALLBACK24-NEXT: movzbl %al, %edi
-; FALLBACK24-NEXT: movl 32(%esp,%edi), %eax
-; FALLBACK24-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: movl %ecx, %edx
-; FALLBACK24-NEXT: movb %cl, %dh
-; FALLBACK24-NEXT: notb %dl
-; FALLBACK24-NEXT: addl %esi, %esi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: orl %eax, %esi
-; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %eax
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: movl 48(%esp,%edi), %esi
-; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: addl %esi, %esi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: orl %eax, %esi
-; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi
-; FALLBACK24-NEXT: movl %esi, %eax
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: addl %ebx, %ebx
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %eax, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp
-; FALLBACK24-NEXT: movl %ebp, %eax
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx
-; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %eax, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: addl %ebp, %ebp
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebp
-; FALLBACK24-NEXT: orl %eax, %ebp
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: movl 60(%esp,%edi), %eax
-; FALLBACK24-NEXT: leal (%eax,%eax), %edi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %edi
-; FALLBACK24-NEXT: orl %ebx, %edi
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: addl %esi, %esi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: orl %ebx, %esi
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: sarl %cl, %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: movl %eax, 28(%ecx)
-; FALLBACK24-NEXT: movl %esi, 4(%ecx)
-; FALLBACK24-NEXT: movl %edi, 24(%ecx)
-; FALLBACK24-NEXT: movl %ebp, 16(%ecx)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: movl %eax, 20(%ecx)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: movl %eax, 8(%ecx)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: movl %eax, 12(%ecx)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: movl %eax, (%ecx)
-; FALLBACK24-NEXT: addl $108, %esp
-; FALLBACK24-NEXT: popl %esi
-; FALLBACK24-NEXT: popl %edi
-; FALLBACK24-NEXT: popl %ebx
-; FALLBACK24-NEXT: popl %ebp
-; FALLBACK24-NEXT: retl
-;
-; FALLBACK25-LABEL: ashr_32bytes:
-; FALLBACK25: # %bb.0:
-; FALLBACK25-NEXT: pushl %ebp
-; FALLBACK25-NEXT: pushl %ebx
-; FALLBACK25-NEXT: pushl %edi
-; FALLBACK25-NEXT: pushl %esi
-; FALLBACK25-NEXT: subl $108, %esp
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK25-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK25-NEXT: movl 16(%ecx), %esi
-; FALLBACK25-NEXT: movl 20(%ecx), %edi
-; FALLBACK25-NEXT: movl 24(%ecx), %ebx
-; FALLBACK25-NEXT: movl 28(%ecx), %edx
-; FALLBACK25-NEXT: movzbl (%eax), %eax
-; FALLBACK25-NEXT: movl %eax, %ecx
-; FALLBACK25-NEXT: shlb $3, %cl
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: sarl $31, %edx
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: andb $28, %al
-; FALLBACK25-NEXT: movzbl %al, %ebp
-; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi
-; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx
-; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi
-; FALLBACK25-NEXT: movl %edi, %esi
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK25-NEXT: movl %esi, 4(%ebp)
-; FALLBACK25-NEXT: movl %ebx, 24(%ebp)
-; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK25-NEXT: sarl %cl, %eax
-; FALLBACK25-NEXT: movl %eax, 28(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 16(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 20(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 8(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 12(%ebp)
-; FALLBACK25-NEXT: movl %edx, (%ebp)
-; FALLBACK25-NEXT: addl $108, %esp
-; FALLBACK25-NEXT: popl %esi
-; FALLBACK25-NEXT: popl %edi
-; FALLBACK25-NEXT: popl %ebx
-; FALLBACK25-NEXT: popl %ebp
-; FALLBACK25-NEXT: retl
-;
-; FALLBACK26-LABEL: ashr_32bytes:
-; FALLBACK26: # %bb.0:
-; FALLBACK26-NEXT: pushl %ebp
-; FALLBACK26-NEXT: pushl %ebx
-; FALLBACK26-NEXT: pushl %edi
-; FALLBACK26-NEXT: pushl %esi
-; FALLBACK26-NEXT: subl $108, %esp
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK26-NEXT: movl 16(%ecx), %esi
-; FALLBACK26-NEXT: movl 20(%ecx), %edi
-; FALLBACK26-NEXT: movl 24(%ecx), %ebx
-; FALLBACK26-NEXT: movl 28(%ecx), %edx
-; FALLBACK26-NEXT: movzbl (%eax), %ecx
-; FALLBACK26-NEXT: movl %ecx, %eax
-; FALLBACK26-NEXT: shlb $3, %al
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: sarl $31, %edx
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: andb $28, %cl
-; FALLBACK26-NEXT: movzbl %cl, %edi
-; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
-; FALLBACK26-NEXT: movl %eax, %edx
-; FALLBACK26-NEXT: notb %dl
-; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: addl %esi, %esi
-; FALLBACK26-NEXT: shlxl %edx, %esi, %esi
-; FALLBACK26-NEXT: orl %ecx, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx
-; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: addl %ecx, %ecx
-; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi
-; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx
-; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx
-; FALLBACK26-NEXT: orl %ebx, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: addl %ecx, %ecx
-; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi
-; FALLBACK26-NEXT: movl 40(%esp,%edi), %ecx
-; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx
-; FALLBACK26-NEXT: movl %eax, %ecx
-; FALLBACK26-NEXT: orl %ebx, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi
-; FALLBACK26-NEXT: leal (%esi,%esi), %ebx
-; FALLBACK26-NEXT: shlxl %edx, %ebx, %eax
-; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx
-; FALLBACK26-NEXT: shrxl %ecx, %ebx, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl %ecx, %eax
-; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK26-NEXT: addl %ebx, %ebx
-; FALLBACK26-NEXT: shlxl %edx, %ebx, %ebx
-; FALLBACK26-NEXT: orl %ebp, %ebx
-; FALLBACK26-NEXT: shrxl %ecx, %esi, %ecx
-; FALLBACK26-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi
-; FALLBACK26-NEXT: sarxl %eax, %edi, %eax
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %edx, %edi, %edi
-; FALLBACK26-NEXT: orl %ecx, %edi
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: addl %ecx, %ecx
-; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK26-NEXT: orl %esi, %ecx
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK26-NEXT: movl %eax, 28(%edx)
-; FALLBACK26-NEXT: movl %ecx, 4(%edx)
-; FALLBACK26-NEXT: movl %edi, 24(%edx)
-; FALLBACK26-NEXT: movl %ebx, 16(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 20(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 8(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 12(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, (%edx)
-; FALLBACK26-NEXT: addl $108, %esp
-; FALLBACK26-NEXT: popl %esi
-; FALLBACK26-NEXT: popl %edi
-; FALLBACK26-NEXT: popl %ebx
-; FALLBACK26-NEXT: popl %ebp
-; FALLBACK26-NEXT: retl
-;
-; FALLBACK27-LABEL: ashr_32bytes:
-; FALLBACK27: # %bb.0:
-; FALLBACK27-NEXT: pushl %ebp
-; FALLBACK27-NEXT: pushl %ebx
-; FALLBACK27-NEXT: pushl %edi
-; FALLBACK27-NEXT: pushl %esi
-; FALLBACK27-NEXT: subl $108, %esp
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK27-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK27-NEXT: movl 16(%ecx), %esi
-; FALLBACK27-NEXT: movl 20(%ecx), %edi
-; FALLBACK27-NEXT: movl 24(%ecx), %ebx
-; FALLBACK27-NEXT: movl 28(%ecx), %edx
-; FALLBACK27-NEXT: movzbl (%eax), %eax
-; FALLBACK27-NEXT: movl %eax, %ecx
-; FALLBACK27-NEXT: shlb $3, %cl
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: sarl $31, %edx
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: andb $28, %al
-; FALLBACK27-NEXT: movzbl %al, %ebx
-; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi
-; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp
-; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl %eax, %edi
-; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi
-; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp
-; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx
-; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx
-; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl %ebx, 4(%eax)
-; FALLBACK27-NEXT: movl %ebp, 24(%eax)
-; FALLBACK27-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; FALLBACK27-NEXT: movl %ebx, 28(%eax)
-; FALLBACK27-NEXT: movl %esi, 16(%eax)
-; FALLBACK27-NEXT: movl %edi, 20(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK27-NEXT: movl %esi, 8(%eax)
-;
FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: movl %esi, 12(%eax) -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK27-NEXT: shrdl %cl, %esi, %edx -; FALLBACK27-NEXT: movl %edx, (%eax) -; FALLBACK27-NEXT: addl $108, %esp -; FALLBACK27-NEXT: popl %esi -; FALLBACK27-NEXT: popl %edi -; FALLBACK27-NEXT: popl %ebx -; FALLBACK27-NEXT: popl %ebp -; FALLBACK27-NEXT: retl -; -; FALLBACK28-LABEL: ashr_32bytes: -; FALLBACK28: # %bb.0: -; FALLBACK28-NEXT: pushl %ebp -; FALLBACK28-NEXT: pushl %ebx -; FALLBACK28-NEXT: pushl %edi -; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $108, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK28-NEXT: movl 16(%ecx), %esi -; FALLBACK28-NEXT: movl 20(%ecx), %edi -; FALLBACK28-NEXT: movl 24(%ecx), %ebx -; FALLBACK28-NEXT: movl 28(%ecx), %edx -; FALLBACK28-NEXT: movzbl (%eax), %eax -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shlb $3, %cl -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: sarl $31, %edx -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: andb $28, %al -; FALLBACK28-NEXT: movzbl %al, %edi -; FALLBACK28-NEXT: movl 32(%esp,%edi), %eax -; FALLBACK28-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: movl %ecx, %edx -; FALLBACK28-NEXT: movb %cl, %dh -; FALLBACK28-NEXT: notb %dl -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %eax, %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, %eax -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: movl 48(%esp,%edi), %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %eax, %esi -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi -; FALLBACK28-NEXT: movl %esi, %eax -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %eax, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp -; FALLBACK28-NEXT: movl %ebp, %eax -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx -; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK28-NEXT: movl %edx, 
%ecx -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %eax, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: addl %ebp, %ebp -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %eax, %ebp -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl 60(%esp,%edi), %eax -; FALLBACK28-NEXT: leal (%eax,%eax), %edi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl %ebx, %edi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movl %edx, %ecx -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movb %dh, %cl -; FALLBACK28-NEXT: sarl %cl, %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: movl %eax, 28(%ecx) -; FALLBACK28-NEXT: movl %esi, 4(%ecx) -; FALLBACK28-NEXT: movl %edi, 24(%ecx) -; FALLBACK28-NEXT: movl %ebp, 16(%ecx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, 20(%ecx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, 8(%ecx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, 12(%ecx) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movl %eax, (%ecx) -; FALLBACK28-NEXT: addl $108, %esp -; FALLBACK28-NEXT: popl %esi -; FALLBACK28-NEXT: popl %edi -; FALLBACK28-NEXT: popl %ebx -; FALLBACK28-NEXT: popl %ebp -; FALLBACK28-NEXT: retl -; -; FALLBACK29-LABEL: ashr_32bytes: -; FALLBACK29: # %bb.0: -; FALLBACK29-NEXT: pushl %ebp -; FALLBACK29-NEXT: pushl %ebx -; FALLBACK29-NEXT: pushl %edi -; FALLBACK29-NEXT: pushl %esi -; FALLBACK29-NEXT: subl $108, %esp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK29-NEXT: movl 16(%ecx), %esi -; FALLBACK29-NEXT: movl 20(%ecx), %edi -; FALLBACK29-NEXT: movl 24(%ecx), %ebx -; FALLBACK29-NEXT: movl 28(%ecx), %edx -; FALLBACK29-NEXT: movzbl (%eax), %eax -; FALLBACK29-NEXT: movl %eax, %ecx -; FALLBACK29-NEXT: shlb $3, %cl -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: sarl $31, %edx -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: andb $28, %al -; FALLBACK29-NEXT: movzbl %al, %ebp -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl %edi, %esi -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %esi, 4(%ebp) -; FALLBACK29-NEXT: movl %ebx, 24(%ebp) -; FALLBACK29-NEXT: shrdl %cl, %edi, %edx -; FALLBACK29-NEXT: sarl %cl, %eax -; FALLBACK29-NEXT: movl %eax, 28(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %edx, (%ebp) -; FALLBACK29-NEXT: addl $108, %esp -; FALLBACK29-NEXT: popl %esi -; FALLBACK29-NEXT: popl %edi -; FALLBACK29-NEXT: popl %ebx -; FALLBACK29-NEXT: popl %ebp -; FALLBACK29-NEXT: retl -; -; FALLBACK30-LABEL: ashr_32bytes: -; FALLBACK30: # %bb.0: -; FALLBACK30-NEXT: pushl %ebp -; FALLBACK30-NEXT: pushl %ebx -; FALLBACK30-NEXT: pushl %edi -; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $108, %esp -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movl 16(%ecx), %esi -; FALLBACK30-NEXT: movl 20(%ecx), %edi -; FALLBACK30-NEXT: movl 24(%ecx), %ebx -; FALLBACK30-NEXT: movl 28(%ecx), %edx -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shlb $3, %al -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: sarl $31, %edx -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %eax, %edx -; FALLBACK30-NEXT: notb %dl -; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl 
%esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK30-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebx -; FALLBACK30-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK30-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %ebx, %ebx -; FALLBACK30-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK30-NEXT: orl %ebp, %ebx -; FALLBACK30-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK30-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK30-NEXT: sarxl %eax, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %edi -; FALLBACK30-NEXT: orl %ecx, %edi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %eax, 28(%edx) -; FALLBACK30-NEXT: movl %ecx, 4(%edx) -; FALLBACK30-NEXT: movl %edi, 24(%edx) -; FALLBACK30-NEXT: movl %ebx, 16(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 8(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 12(%edx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, (%edx) -; FALLBACK30-NEXT: addl $108, %esp -; FALLBACK30-NEXT: popl %esi -; FALLBACK30-NEXT: popl %edi -; FALLBACK30-NEXT: popl %ebx -; FALLBACK30-NEXT: popl %ebp -; FALLBACK30-NEXT: retl -; -; FALLBACK31-LABEL: ashr_32bytes: -; FALLBACK31: # %bb.0: -; FALLBACK31-NEXT: pushl %ebp -; FALLBACK31-NEXT: pushl %ebx -; FALLBACK31-NEXT: pushl %edi -; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $108, %esp -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK31-NEXT: movl 16(%ecx), %esi -; FALLBACK31-NEXT: movl 20(%ecx), %edi -; FALLBACK31-NEXT: movl 24(%ecx), %ebx -; FALLBACK31-NEXT: movl 28(%ecx), %edx -; FALLBACK31-NEXT: movzbl (%eax), %eax -; FALLBACK31-NEXT: movl %eax, %ecx -; FALLBACK31-NEXT: shlb $3, %cl -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp) -; 
FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: sarl $31, %edx -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: andb $28, %al -; FALLBACK31-NEXT: movzbl %al, %ebx -; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi -; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp -; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, %edi -; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi -; FALLBACK31-NEXT: shrdl %cl, %eax, %esi -; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp -; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx -; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx -; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl %ebx, 4(%eax) -; FALLBACK31-NEXT: movl %ebp, 24(%eax) -; FALLBACK31-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; FALLBACK31-NEXT: movl %ebx, 28(%eax) -; FALLBACK31-NEXT: movl %esi, 16(%eax) -; FALLBACK31-NEXT: movl %edi, 20(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: movl %esi, 8(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: movl %esi, 12(%eax) -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, (%eax) -; FALLBACK31-NEXT: addl $108, %esp -; FALLBACK31-NEXT: popl %esi -; FALLBACK31-NEXT: popl %edi -; FALLBACK31-NEXT: popl %ebx -; FALLBACK31-NEXT: popl %ebp -; FALLBACK31-NEXT: retl +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; 
X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rdi, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rax +; 
X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; 
X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rax, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes: +; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %r9d +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9), %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; 
X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %sil +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10 +; 
X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rsi,%rsi), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %rsi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes: +; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; 
X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %sil +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %r9d +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9), %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r8,%r8), %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9), %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r11, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes: +; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %sil +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 
%rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes: +; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %esi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rsi), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rsi), %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rsi), %rsi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rsi,%rsi), %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r9, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %rsi, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes: +; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8 +; 
X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %rsi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes: +; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%esi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%esi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%esi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esi), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %dl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: sarl $31, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; 
X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %ch +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%edi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %eax, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%eax), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%eax), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edx,%edx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%eax), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%eax,%eax), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: sarl %cl, %eax +; 
X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 24(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 16(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 20(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 8(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $92, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarl $31, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 
{{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %al +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %al, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $92, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esi), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%esi), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%esi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%esi), %ebp +; 
X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%edx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esi), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esi), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%esi), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarl $31, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %dl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %dl, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%esp,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%esp,%esi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%ebp,%ebp), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, 32(%esp,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%esp,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%esi), %edi +; 
X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%esi), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %eax, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%esi), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxl %ebp, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 28(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 20(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%edi) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $92, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ecx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ecx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ecx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx +; 
X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarl $31, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %al +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %al, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxl %ecx, %edi, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 16(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 20(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%ebp) +; 
X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $92, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes: +; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: sarl $31, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %al +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %cl, %dh +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%edi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: sarl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 4(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 24(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 16(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, (%ecx) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; 
X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%ecx), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%ecx), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%ecx), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarl $31, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %al +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: 
movl %ebx, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%ecx), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %bl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarl $31, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %dl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %dl, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %bl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%esp,%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%esp,%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxl %ebx, %ecx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; 
X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%ecx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%ecx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%ecx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarl $31, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %al +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 32(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%esp,%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx +; 
X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes: +; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 16(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 20(%ecx), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 24(%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 28(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $28, %al +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%edi), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %edx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %cl, %dh +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi +; 
X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%edi), %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%edi), %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%edi), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%edi), %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %eax +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 4(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 24(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 16(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, (%ecx) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 16(%ecx), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 20(%ecx), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 24(%ecx), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 28(%ecx), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $28, %al +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebp, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 16(%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%ecx), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 28(%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %bl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) 
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %dl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %dl, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %bl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 36(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 44(%esp,%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebp, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %edx, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 40(%esp,%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebp, %edx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %ebp, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%ecx,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ebx, %ecx, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 20(%edx) +; 
X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, (%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 16(%ecx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%ecx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%ecx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 28(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %al +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 44(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 40(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; 
X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 32(%esp,%ebx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 36(%esp,%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 %bitOff = shl i256 %byteOff, 3 @@ -10996,663 +8868,500 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { } define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { -; FALLBACK0-LABEL: ashr_32bytes_dwordOff: -; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rcx -; FALLBACK0-NEXT: movq 8(%rdi), %r8 -; FALLBACK0-NEXT: movq 16(%rdi), %r9 -; FALLBACK0-NEXT: movq 24(%rdi), %rdi -; FALLBACK0-NEXT: movzbl (%rsi), %esi -; FALLBACK0-NEXT: movl %esi, %eax -; FALLBACK0-NEXT: shlb $5, %al -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: sarq $63, %rdi -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: andb $6, %sil -; FALLBACK0-NEXT: movzbl %sil, %r9d -; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10 -; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi -; FALLBACK0-NEXT: movq %rdi, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 -; FALLBACK0-NEXT: movl %eax, %esi -; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r11, %r8 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; 
-; FALLBACK0-NEXT: addq %rdi, %rdi
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %rdi
-; FALLBACK0-NEXT: orq %r10, %rdi
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: orq %rbx, %r10
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: sarq %cl, %r9
-; FALLBACK0-NEXT: movq %r9, 24(%rdx)
-; FALLBACK0-NEXT: movq %r10, 16(%rdx)
-; FALLBACK0-NEXT: movq %rdi, (%rdx)
-; FALLBACK0-NEXT: movq %r8, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: movq (%rdi), %rax
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %rdi
-; FALLBACK1-NEXT: movzbl (%rsi), %esi
-; FALLBACK1-NEXT: movl %esi, %ecx
-; FALLBACK1-NEXT: shlb $5, %cl
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: sarq $63, %rdi
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: andb $6, %sil
-; FALLBACK1-NEXT: movzbl %sil, %eax
-; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi
-; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi
-; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8
-; FALLBACK1-NEXT: movq %r8, %r9
-; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax
-; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK1-NEXT: sarq %cl, %rax
-; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK1-NEXT: movq %rax, 24(%rdx)
-; FALLBACK1-NEXT: movq %rdi, (%rdx)
-; FALLBACK1-NEXT: movq %r9, 8(%rdx)
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: movq (%rdi), %rcx
-; FALLBACK2-NEXT: movq 8(%rdi), %r8
-; FALLBACK2-NEXT: movq 16(%rdi), %r9
-; FALLBACK2-NEXT: movq 24(%rdi), %rdi
-; FALLBACK2-NEXT: movzbl (%rsi), %esi
-; FALLBACK2-NEXT: movl %esi, %eax
-; FALLBACK2-NEXT: shlb $5, %al
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: sarq $63, %rdi
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: andb $6, %sil
-; FALLBACK2-NEXT: movzbl %sil, %ecx
-; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi
-; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi
-; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
-; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9
-; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK2-NEXT: notb %al
-; FALLBACK2-NEXT: addq %rdi, %rdi
-; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT: orq %r8, %rdi
-; FALLBACK2-NEXT: addq %rsi, %rsi
-; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r9, %rsi
-; FALLBACK2-NEXT: addq %rcx, %rcx
-; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
-; FALLBACK2-NEXT: orq %r10, %rax
-; FALLBACK2-NEXT: movq %r11, 24(%rdx)
-; FALLBACK2-NEXT: movq %rax, 16(%rdx)
-; FALLBACK2-NEXT: movq %rsi, (%rdx)
-; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
-; FALLBACK2-NEXT: retq
-;
-; FALLBACK3-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK3: # %bb.0:
-; FALLBACK3-NEXT: movq (%rdi), %rax
-; FALLBACK3-NEXT: movq 8(%rdi), %r8
-; FALLBACK3-NEXT: movq 16(%rdi), %r9
-; FALLBACK3-NEXT: movq 24(%rdi), %rdi
-; FALLBACK3-NEXT: movzbl (%rsi), %esi
-; FALLBACK3-NEXT: movl %esi, %ecx
-; FALLBACK3-NEXT: shlb $5, %cl
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: sarq $63, %rdi
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: andb $6, %sil
-; FALLBACK3-NEXT: movzbl %sil, %eax
-; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi
-; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi
-; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8
-; FALLBACK3-NEXT: movq %r8, %r9
-; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax
-; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax
-; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK3-NEXT: movq %rax, 24(%rdx)
-; FALLBACK3-NEXT: movq %rdi, (%rdx)
-; FALLBACK3-NEXT: movq %r9, 8(%rdx)
-; FALLBACK3-NEXT: retq
-;
-; FALLBACK4-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK4: # %bb.0:
-; FALLBACK4-NEXT: pushq %rbx
-; FALLBACK4-NEXT: movups (%rdi), %xmm0
-; FALLBACK4-NEXT: movq 16(%rdi), %rcx
-; FALLBACK4-NEXT: movq 24(%rdi), %rdi
-; FALLBACK4-NEXT: movzbl (%rsi), %esi
-; FALLBACK4-NEXT: movl %esi, %eax
-; FALLBACK4-NEXT: shlb $5, %al
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: sarq $63, %rdi
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: andb $6, %sil
-; FALLBACK4-NEXT: movzbl %sil, %r9d
-; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10
-; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r10
-; FALLBACK4-NEXT: movl %eax, %esi
-; FALLBACK4-NEXT: notb %sil
-; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rdi
-; FALLBACK4-NEXT: orq %r10, %rdi
-; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10
-; FALLBACK4-NEXT: movq %r10, %r11
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r11
-; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rbx
-; FALLBACK4-NEXT: orq %r11, %rbx
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r8
-; FALLBACK4-NEXT: addq %r10, %r10
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r10
-; FALLBACK4-NEXT: orq %r8, %r10
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: sarq %cl, %r9
-; FALLBACK4-NEXT: movq %r9, 24(%rdx)
-; FALLBACK4-NEXT: movq %r10, 8(%rdx)
-; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK4-NEXT: movq %rdi, (%rdx)
-; FALLBACK4-NEXT: popq %rbx
-; FALLBACK4-NEXT: retq
-;
-; FALLBACK5-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK5: # %bb.0:
-; FALLBACK5-NEXT: movups (%rdi), %xmm0
-; FALLBACK5-NEXT: movq 16(%rdi), %rax
-; FALLBACK5-NEXT: movq 24(%rdi), %rdi
-; FALLBACK5-NEXT: movzbl (%rsi), %esi
-; FALLBACK5-NEXT: movl %esi, %ecx
-; FALLBACK5-NEXT: shlb $5, %cl
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: sarq $63, %rdi
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: andb $6, %sil
-; FALLBACK5-NEXT: movzbl %sil, %eax
-; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK5-NEXT: movq %rdi, %r8
-; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK5-NEXT: movq %rax, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK5-NEXT: sarq %cl, %rsi
-; FALLBACK5-NEXT: movq %r10, 8(%rdx)
-; FALLBACK5-NEXT: movq %r8, 16(%rdx)
-; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK5-NEXT: movq %r9, (%rdx)
-; FALLBACK5-NEXT: retq
-;
-; FALLBACK6-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK6: # %bb.0:
-; FALLBACK6-NEXT: movups (%rdi), %xmm0
-; FALLBACK6-NEXT: movq 16(%rdi), %rcx
-; FALLBACK6-NEXT: movq 24(%rdi), %rdi
-; FALLBACK6-NEXT: movzbl (%rsi), %esi
-; FALLBACK6-NEXT: movl %esi, %eax
-; FALLBACK6-NEXT: shlb $5, %al
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: sarq $63, %rdi
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: andb $6, %sil
-; FALLBACK6-NEXT: movzbl %sil, %ecx
-; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8
-; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK6-NEXT: notb %al
-; FALLBACK6-NEXT: addq %rdi, %rdi
-; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT: orq %rsi, %rdi
-; FALLBACK6-NEXT: addq %rcx, %rcx
-; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT: orq %r9, %rcx
-; FALLBACK6-NEXT: addq %r8, %r8
-; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK6-NEXT: orq %r10, %rax
-; FALLBACK6-NEXT: movq %r11, 24(%rdx)
-; FALLBACK6-NEXT: movq %rax, 8(%rdx)
-; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT: movq %rdi, (%rdx)
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movq 16(%rdi), %rax
-; FALLBACK7-NEXT: movq 24(%rdi), %rdi
-; FALLBACK7-NEXT: movzbl (%rsi), %esi
-; FALLBACK7-NEXT: movl %esi, %ecx
-; FALLBACK7-NEXT: shlb $5, %cl
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: sarq $63, %rdi
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: andb $6, %sil
-; FALLBACK7-NEXT: movzbl %sil, %eax
-; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK7-NEXT: movq %rdi, %r8
-; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK7-NEXT: movq %rax, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax
-; FALLBACK7-NEXT: movq %r10, 8(%rdx)
-; FALLBACK7-NEXT: movq %r8, 16(%rdx)
-; FALLBACK7-NEXT: movq %rax, 24(%rdx)
-; FALLBACK7-NEXT: movq %r9, (%rdx)
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: pushq %rbx
-; FALLBACK8-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK8-NEXT: movq 16(%rdi), %rcx
-; FALLBACK8-NEXT: movq 24(%rdi), %rdi
-; FALLBACK8-NEXT: movzbl (%rsi), %esi
-; FALLBACK8-NEXT: movl %esi, %eax
-; FALLBACK8-NEXT: shlb $5, %al
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: sarq $63, %rdi
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: andb $6, %sil
-; FALLBACK8-NEXT: movzbl %sil, %r9d
-; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10
-; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rdi
-; FALLBACK8-NEXT: orq %r10, %rdi
-; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10
-; FALLBACK8-NEXT: movq %r10, %r11
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r11
-; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rbx
-; FALLBACK8-NEXT: orq %r11, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r8
-; FALLBACK8-NEXT: addq %r10, %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: orq %r8, %r10
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: sarq %cl, %r9
-; FALLBACK8-NEXT: movq %r9, 24(%rdx)
-; FALLBACK8-NEXT: movq %r10, 8(%rdx)
-; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK8-NEXT: movq %rdi, (%rdx)
-; FALLBACK8-NEXT: popq %rbx
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK9-NEXT: movq 16(%rdi), %rax
-; FALLBACK9-NEXT: movq 24(%rdi), %rdi
-; FALLBACK9-NEXT: movzbl (%rsi), %esi
-; FALLBACK9-NEXT: movl %esi, %ecx
-; FALLBACK9-NEXT: shlb $5, %cl
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: sarq $63, %rdi
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: andb $6, %sil
-; FALLBACK9-NEXT: movzbl %sil, %eax
-; FALLBACK9-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK9-NEXT: movq %rdi, %r8
-; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK9-NEXT: movq %rax, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK9-NEXT: sarq %cl, %rsi
-; FALLBACK9-NEXT: movq %r10, 8(%rdx)
-; FALLBACK9-NEXT: movq %r8, 16(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK9-NEXT: movq %r9, (%rdx)
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK10-NEXT: movq 16(%rdi), %rcx
-; FALLBACK10-NEXT: movq 24(%rdi), %rdi
-; FALLBACK10-NEXT: movzbl (%rsi), %esi
-; FALLBACK10-NEXT: movl %esi, %eax
-; FALLBACK10-NEXT: shlb $5, %al
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: sarq $63, %rdi
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: andb $6, %sil
-; FALLBACK10-NEXT: movzbl %sil, %ecx
-; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8
-; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK10-NEXT: notb %al
-; FALLBACK10-NEXT: addq %rdi, %rdi
-; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT: orq %rsi, %rdi
-; FALLBACK10-NEXT: addq %rcx, %rcx
-; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT: orq %r9, %rcx
-; FALLBACK10-NEXT: addq %r8, %r8
-; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK10-NEXT: orq %r10, %rax
-; FALLBACK10-NEXT: movq %r11, 24(%rdx)
-; FALLBACK10-NEXT: movq %rax, 8(%rdx)
-; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT: movq %rdi, (%rdx)
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK11-NEXT: movq 16(%rdi), %rax
-; FALLBACK11-NEXT: movq 24(%rdi), %rdi
-; FALLBACK11-NEXT: movzbl (%rsi), %esi
-; FALLBACK11-NEXT: movl %esi, %ecx
-; FALLBACK11-NEXT: shlb $5, %cl
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: sarq $63, %rdi
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: andb $6, %sil
-; FALLBACK11-NEXT: movzbl %sil, %eax
-; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK11-NEXT: movq %rdi, %r8
-; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK11-NEXT: movq %rax, %r10
-; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax
-; FALLBACK11-NEXT: movq %r10, 8(%rdx)
-; FALLBACK11-NEXT: movq %r8, 16(%rdx)
-; FALLBACK11-NEXT: movq %rax, 24(%rdx)
-; FALLBACK11-NEXT: movq %r9, (%rdx)
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: pushq %rbx
-; FALLBACK12-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK12-NEXT: movq 16(%rdi), %rcx
-; FALLBACK12-NEXT: movq 24(%rdi), %rdi
-; FALLBACK12-NEXT: movzbl (%rsi), %esi
-; FALLBACK12-NEXT: movl %esi, %eax
-; FALLBACK12-NEXT: shlb $5, %al
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: sarq $63, %rdi
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: andb $6, %sil
-; FALLBACK12-NEXT: movzbl %sil, %r9d
-; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10
-; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rdi
-; FALLBACK12-NEXT: orq %r10, %rdi
-; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10
-; FALLBACK12-NEXT: movq %r10, %r11
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r11
-; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rbx
-; FALLBACK12-NEXT: orq %r11, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r8
-; FALLBACK12-NEXT: addq %r10, %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: orq %r8, %r10
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: sarq %cl, %r9
-; FALLBACK12-NEXT: movq %r9, 24(%rdx)
-; FALLBACK12-NEXT: movq %r10, 8(%rdx)
-; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK12-NEXT: movq %rdi, (%rdx)
-; FALLBACK12-NEXT: popq %rbx
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK13-NEXT: movq 16(%rdi), %rax
-; FALLBACK13-NEXT: movq 24(%rdi), %rdi
-; FALLBACK13-NEXT: movzbl (%rsi), %esi
-; FALLBACK13-NEXT: movl %esi, %ecx
-; FALLBACK13-NEXT: shlb $5, %cl
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: sarq $63, %rdi
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: andb $6, %sil
-; FALLBACK13-NEXT: movzbl %sil, %eax
-; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK13-NEXT: movq %rdi, %r8
-; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK13-NEXT: movq %rax, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK13-NEXT: sarq %cl, %rsi
-; FALLBACK13-NEXT: movq %r10, 8(%rdx)
-; FALLBACK13-NEXT: movq %r8, 16(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK13-NEXT: movq %r9, (%rdx)
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK14-NEXT: movq 16(%rdi), %rcx
-; FALLBACK14-NEXT: movq 24(%rdi), %rdi
-; FALLBACK14-NEXT: movzbl (%rsi), %esi
-; FALLBACK14-NEXT: movl %esi, %eax
-; FALLBACK14-NEXT: shlb $5, %al
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: sarq $63, %rdi
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: andb $6, %sil
-; FALLBACK14-NEXT: movzbl %sil, %ecx
-; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8
-; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK14-NEXT: notb %al
-; FALLBACK14-NEXT: addq %rdi, %rdi
-; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT: orq %rsi, %rdi
-; FALLBACK14-NEXT: addq %rcx, %rcx
-; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT: orq %r9, %rcx
-; FALLBACK14-NEXT: addq %r8, %r8
-; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK14-NEXT: orq %r10, %rax
-; FALLBACK14-NEXT: movq %r11, 24(%rdx)
-; FALLBACK14-NEXT: movq %rax, 8(%rdx)
-; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT: movq %rdi, (%rdx)
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK15-NEXT: movq 16(%rdi), %rax
-; FALLBACK15-NEXT: movq 24(%rdi), %rdi
-; FALLBACK15-NEXT: movzbl (%rsi), %esi
-; FALLBACK15-NEXT: movl %esi, %ecx
-; FALLBACK15-NEXT: shlb $5, %cl
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: sarq $63, %rdi
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: andb $6, %sil
-; FALLBACK15-NEXT: movzbl %sil, %eax
-; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK15-NEXT: movq %rdi, %r8
-; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK15-NEXT: movq %rax, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax
-; FALLBACK15-NEXT: movq %r10, 8(%rdx)
-; FALLBACK15-NEXT: movq %r8, 16(%rdx)
-; FALLBACK15-NEXT: movq %rax, 24(%rdx)
-; FALLBACK15-NEXT: movq %r9, (%rdx)
-; FALLBACK15-NEXT: retq
+; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9,4), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9,4), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rsi,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $6, %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %r9d
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %sil
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rsi,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X86-SSE2-LABEL: ashr_32bytes_dwordOff:
; X86-SSE2: # %bb.0:
@@ -12035,3644 +9744,3629 @@ define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
}

define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: lshr_64bytes:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %r15
-; FALLBACK0-NEXT: pushq %r14
-; FALLBACK0-NEXT: pushq %r13
-; FALLBACK0-NEXT: pushq %r12
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rax
-; FALLBACK0-NEXT: movq 8(%rdi), %rcx
-; FALLBACK0-NEXT: movq 16(%rdi), %r8
-; FALLBACK0-NEXT: movq 24(%rdi), %r9
-; FALLBACK0-NEXT: movq 32(%rdi), %r10
-; FALLBACK0-NEXT: movq 40(%rdi), %r11
-; FALLBACK0-NEXT: movq 48(%rdi), %rbx
-; FALLBACK0-NEXT: movq 56(%rdi), %r14
-; FALLBACK0-NEXT: movl (%rsi), %edi
-; FALLBACK0-NEXT: xorps %xmm0, %xmm0
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: leal (,%rdi,8), %eax
-; FALLBACK0-NEXT: andl $56, %eax
-; FALLBACK0-NEXT: andl $56, %edi
-; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10
-; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8
-; FALLBACK0-NEXT: movq %r8, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx
-; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r9
-; FALLBACK0-NEXT: orq %r11, %r9
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r10
-; FALLBACK0-NEXT: addq %r8, %r8
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: orq %r10, %r8
-; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10
-; FALLBACK0-NEXT: movq %r10, %r15
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r15
-; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14
-; FALLBACK0-NEXT: leaq (%r14,%r14), %r11
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r11
-; FALLBACK0-NEXT: orq %r15, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: addq %r10, %r10
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: orq %rbx, %r10
-; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx
-; FALLBACK0-NEXT: movq %rbx, %r12
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r12
-; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13
-; FALLBACK0-NEXT: leaq (%r13,%r13), %r15
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r15
-; FALLBACK0-NEXT: orq %r12, %r15
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r14
-; FALLBACK0-NEXT: addq %rbx, %rbx
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %rbx
-; FALLBACK0-NEXT: orq %r14, %rbx
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r13
-; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi
-; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r14
-; FALLBACK0-NEXT: orq %r13, %r14
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rdi
-; FALLBACK0-NEXT: movq %rdi, 56(%rdx)
-; FALLBACK0-NEXT: movq %r14, 48(%rdx)
-; FALLBACK0-NEXT: movq %rbx, 32(%rdx)
-; FALLBACK0-NEXT: movq %r15, 40(%rdx)
-; FALLBACK0-NEXT: movq %r10, 16(%rdx)
-; FALLBACK0-NEXT: movq %r11, 24(%rdx)
-; FALLBACK0-NEXT: movq %r8, (%rdx)
-; FALLBACK0-NEXT: movq %r9, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: popq %r12
-; FALLBACK0-NEXT: popq %r13
-; FALLBACK0-NEXT: popq %r14
-; FALLBACK0-NEXT: popq %r15
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: lshr_64bytes:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: pushq %r15
-; FALLBACK1-NEXT: pushq %r14
-; FALLBACK1-NEXT: pushq %rbx
-; FALLBACK1-NEXT: movq (%rdi), %rcx
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %r10
-; FALLBACK1-NEXT: movq 32(%rdi), %r11
-; FALLBACK1-NEXT: movq 40(%rdi), %rbx
-; FALLBACK1-NEXT: movq 48(%rdi), %r14
-; FALLBACK1-NEXT: movq 56(%rdi), %rdi
-; FALLBACK1-NEXT: movl (%rsi), %eax
-; FALLBACK1-NEXT: xorps %xmm0, %xmm0
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: leal (,%rax,8), %ecx
-; FALLBACK1-NEXT: andl $56, %ecx
-; FALLBACK1-NEXT: andl $56, %eax
-; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi
-; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi
-; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9
-; FALLBACK1-NEXT: movq %r9, %r8
-; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8
-; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10
-; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11
-; FALLBACK1-NEXT: movq %r11, %rbx
-; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx
-; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11
-; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14
-; FALLBACK1-NEXT: movq %r14, %r15
-; FALLBACK1-NEXT: shrdq %cl, %r11, %r15
-; FALLBACK1-NEXT: shrdq %cl, %r14, %r10
-; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK1-NEXT: shrdq %cl, %rax, %r11
-; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi
-; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK1-NEXT: shrq %cl, %rax
-; FALLBACK1-NEXT: movq %r11, 48(%rdx)
-; FALLBACK1-NEXT: movq %rax, 56(%rdx)
-; FALLBACK1-NEXT: movq %r10, 32(%rdx)
-; FALLBACK1-NEXT: movq %r15, 40(%rdx)
-; FALLBACK1-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK1-NEXT: movq %rbx, 24(%rdx)
-; FALLBACK1-NEXT: movq %rsi, (%rdx)
-; FALLBACK1-NEXT: movq %r8, 8(%rdx)
-; FALLBACK1-NEXT: popq %rbx
-; FALLBACK1-NEXT: popq %r14
-; FALLBACK1-NEXT: popq %r15
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: lshr_64bytes:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: pushq %rbp
-; FALLBACK2-NEXT: pushq %r15
-; FALLBACK2-NEXT: pushq %r14
-; FALLBACK2-NEXT: pushq %r13
-; FALLBACK2-NEXT: pushq %r12
-; FALLBACK2-NEXT: pushq %rbx
-; FALLBACK2-NEXT: pushq %rax
-; FALLBACK2-NEXT: movq (%rdi), %rcx
-; FALLBACK2-NEXT: movq 8(%rdi), %r8
-; FALLBACK2-NEXT: movq 16(%rdi), %r9
-; FALLBACK2-NEXT: movq 24(%rdi), %r10
-; FALLBACK2-NEXT: movq 32(%rdi), %r11
-; FALLBACK2-NEXT: movq 40(%rdi), %rbx
-; FALLBACK2-NEXT: movq 48(%rdi), %r14
-; FALLBACK2-NEXT: movq 56(%rdi), %rdi
-; FALLBACK2-NEXT: movl (%rsi), %eax
-; FALLBACK2-NEXT: xorps %xmm0, %xmm0
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: leal (,%rax,8), %ecx
-; FALLBACK2-NEXT: andl $56, %ecx
-; FALLBACK2-NEXT: andl $56, %eax
-; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi
-; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9
-; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx
-; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13
-; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi
-; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8
-; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10
-; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11
-; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14
-; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15
-; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp
-; FALLBACK2-NEXT: movl %ecx, %r12d
-; FALLBACK2-NEXT: notb %r12b
-; FALLBACK2-NEXT: addq %r9, %r9
-; FALLBACK2-NEXT: shlxq %r12, %r9, %r9
-; FALLBACK2-NEXT: orq %rbx, %r9
-; FALLBACK2-NEXT: addq %rdi, %rdi
-; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi
-; FALLBACK2-NEXT: orq %r13, %rdi
-; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx
-; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13
-; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx
-; FALLBACK2-NEXT: addq %r10, %r10
-; FALLBACK2-NEXT: shlxq %r12, %r10, %r10
-; FALLBACK2-NEXT: orq %r8, %r10
-; FALLBACK2-NEXT: addq %rsi, %rsi
-; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r11, %rsi
-; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8
-; FALLBACK2-NEXT: shlxq %r12, %r8, %r8
-; FALLBACK2-NEXT: orq %r15, %r8
-; FALLBACK2-NEXT: addq %r14, %r14
-; FALLBACK2-NEXT: shlxq %r12, %r14, %r11
FALLBACK2-NEXT: orq %rbp, %r11 -; FALLBACK2-NEXT: addq %rax, %rax -; FALLBACK2-NEXT: shlxq %r12, %rax, %rax -; FALLBACK2-NEXT: orq %r13, %rax -; FALLBACK2-NEXT: movq %rcx, 56(%rdx) -; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r8, 40(%rdx) -; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) -; FALLBACK2-NEXT: addq $8, %rsp -; FALLBACK2-NEXT: popq %rbx -; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 -; FALLBACK2-NEXT: popq %r14 -; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: lshr_64bytes: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: pushq %r15 -; FALLBACK3-NEXT: pushq %r14 -; FALLBACK3-NEXT: pushq %rbx -; FALLBACK3-NEXT: movq (%rdi), %rcx -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %r10 -; FALLBACK3-NEXT: movq 32(%rdi), %r11 -; FALLBACK3-NEXT: movq 40(%rdi), %rbx -; FALLBACK3-NEXT: movq 48(%rdi), %r14 -; FALLBACK3-NEXT: movq 56(%rdi), %rdi -; FALLBACK3-NEXT: movl (%rsi), %eax -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: leal (,%rax,8), %ecx -; FALLBACK3-NEXT: andl $56, %ecx -; FALLBACK3-NEXT: andl $56, %eax -; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi -; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi -; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9 -; FALLBACK3-NEXT: movq %r9, %r8 -; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8 -; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11 -; FALLBACK3-NEXT: movq %r11, %rbx -; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx -; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11 -; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK3-NEXT: movq %r14, %r15 -; FALLBACK3-NEXT: shrdq %cl, %r11, %r15 -; FALLBACK3-NEXT: shrdq %cl, %r14, %r10 -; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %r11 -; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax -; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi -; FALLBACK3-NEXT: movq %r11, 48(%rdx) -; FALLBACK3-NEXT: movq %r10, 32(%rdx) -; FALLBACK3-NEXT: movq %r15, 40(%rdx) -; FALLBACK3-NEXT: movq %rdi, 16(%rdx) -; FALLBACK3-NEXT: movq %rbx, 24(%rdx) -; FALLBACK3-NEXT: movq %rsi, (%rdx) -; FALLBACK3-NEXT: movq %r8, 8(%rdx) -; FALLBACK3-NEXT: movq %rax, 56(%rdx) -; FALLBACK3-NEXT: popq %rbx -; FALLBACK3-NEXT: popq %r14 -; FALLBACK3-NEXT: popq %r15 -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: lshr_64bytes: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: pushq %rbp -; FALLBACK4-NEXT: pushq %r15 -; FALLBACK4-NEXT: pushq %r14 -; FALLBACK4-NEXT: pushq %r13 -; FALLBACK4-NEXT: pushq %r12 -; FALLBACK4-NEXT: pushq %rbx -; FALLBACK4-NEXT: pushq %rax -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK4-NEXT: movups 
32(%rdi), %xmm2 -; FALLBACK4-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK4-NEXT: movl (%rsi), %r8d -; FALLBACK4-NEXT: xorps %xmm4, %xmm4 -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: leal (,%r8,8), %eax -; FALLBACK4-NEXT: andl $56, %eax -; FALLBACK4-NEXT: andl $56, %r8d -; FALLBACK4-NEXT: movq -128(%rsp,%r8), %r10 -; FALLBACK4-NEXT: movq -120(%rsp,%r8), %r9 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: leaq (%r9,%r9), %rdi -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rdi -; FALLBACK4-NEXT: orq %r10, %rdi -; FALLBACK4-NEXT: movq -104(%rsp,%r8), %r10 -; FALLBACK4-NEXT: movq %r10, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %rbx -; FALLBACK4-NEXT: movq -96(%rsp,%r8), %r12 -; FALLBACK4-NEXT: leaq (%r12,%r12), %r11 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r11 -; FALLBACK4-NEXT: orq %rbx, %r11 -; FALLBACK4-NEXT: movq -112(%rsp,%r8), %rbx -; FALLBACK4-NEXT: movq %rbx, %r14 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r14 -; FALLBACK4-NEXT: addq %r10, %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r14, %r10 -; FALLBACK4-NEXT: movq -88(%rsp,%r8), %r14 -; FALLBACK4-NEXT: movq %r14, %r13 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r13 -; FALLBACK4-NEXT: movq -80(%rsp,%r8), %rbp -; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r15 -; FALLBACK4-NEXT: orq %r13, %r15 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r12 -; FALLBACK4-NEXT: addq %r14, %r14 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r14 -; FALLBACK4-NEXT: orq %r12, %r14 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %rbp -; FALLBACK4-NEXT: movq -72(%rsp,%r8), %r8 -; FALLBACK4-NEXT: leaq (%r8,%r8), %r12 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r12 -; FALLBACK4-NEXT: orq %rbp, %r12 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r9 -; FALLBACK4-NEXT: addq %rbx, %rbx -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r9, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: movq %r8, 56(%rdx) -; FALLBACK4-NEXT: movq %rbx, 8(%rdx) -; FALLBACK4-NEXT: movq %r12, 48(%rdx) -; FALLBACK4-NEXT: movq %r14, 32(%rdx) -; FALLBACK4-NEXT: movq %r15, 40(%rdx) -; FALLBACK4-NEXT: movq %r10, 16(%rdx) -; FALLBACK4-NEXT: movq %r11, 24(%rdx) -; FALLBACK4-NEXT: movq %rdi, (%rdx) -; FALLBACK4-NEXT: addq $8, %rsp -; FALLBACK4-NEXT: popq %rbx -; FALLBACK4-NEXT: popq %r12 -; FALLBACK4-NEXT: popq %r13 -; FALLBACK4-NEXT: popq %r14 -; FALLBACK4-NEXT: popq %r15 -; FALLBACK4-NEXT: popq %rbp -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: lshr_64bytes: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: pushq %r15 -; FALLBACK5-NEXT: pushq %r14 -; FALLBACK5-NEXT: pushq %rbx -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK5-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK5-NEXT: movups 
48(%rdi), %xmm3 -; FALLBACK5-NEXT: movl (%rsi), %eax -; FALLBACK5-NEXT: xorps %xmm4, %xmm4 -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: leal (,%rax,8), %ecx -; FALLBACK5-NEXT: andl $56, %ecx -; FALLBACK5-NEXT: andl $56, %eax -; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK5-NEXT: movq %r9, %rsi -; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK5-NEXT: movq %r10, %r8 -; FALLBACK5-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK5-NEXT: movq %r11, %rbx -; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK5-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK5-NEXT: movq %rax, %r15 -; FALLBACK5-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK5-NEXT: shrq %cl, %r11 -; FALLBACK5-NEXT: movq %r15, 8(%rdx) -; FALLBACK5-NEXT: movq %r9, 48(%rdx) -; FALLBACK5-NEXT: movq %r11, 56(%rdx) -; FALLBACK5-NEXT: movq %rdi, 32(%rdx) -; FALLBACK5-NEXT: movq %rbx, 40(%rdx) -; FALLBACK5-NEXT: movq %r8, 16(%rdx) -; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r14, (%rdx) -; FALLBACK5-NEXT: popq %rbx -; FALLBACK5-NEXT: popq %r14 -; FALLBACK5-NEXT: popq %r15 -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: lshr_64bytes: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp -; FALLBACK6-NEXT: pushq %r15 -; FALLBACK6-NEXT: pushq %r14 -; FALLBACK6-NEXT: pushq %r13 -; FALLBACK6-NEXT: pushq %r12 -; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: pushq %rax -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK6-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK6-NEXT: movl (%rsi), %eax -; FALLBACK6-NEXT: xorps %xmm4, %xmm4 -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %esi -; FALLBACK6-NEXT: andl $56, %esi -; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK6-NEXT: movl %esi, %ebx -; FALLBACK6-NEXT: notb %bl -; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK6-NEXT: orq %r11, %r8 -; 
FALLBACK6-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK6-NEXT: orq %r12, %r11 -; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp -; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK6-NEXT: shrxq %rsi, %rax, %rsi -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK6-NEXT: orq %r9, %rdi -; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK6-NEXT: orq %r14, %r9 -; FALLBACK6-NEXT: addq %r10, %r10 -; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK6-NEXT: orq %r15, %r10 -; FALLBACK6-NEXT: addq %rax, %rax -; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK6-NEXT: orq %r13, %rax -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK6-NEXT: orq %rbp, %rcx -; FALLBACK6-NEXT: movq %rsi, 56(%rdx) -; FALLBACK6-NEXT: movq %rcx, 8(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %r10, 32(%rdx) -; FALLBACK6-NEXT: movq %r9, 40(%rdx) -; FALLBACK6-NEXT: movq %rdi, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r8, (%rdx) -; FALLBACK6-NEXT: addq $8, %rsp -; FALLBACK6-NEXT: popq %rbx -; FALLBACK6-NEXT: popq %r12 -; FALLBACK6-NEXT: popq %r13 -; FALLBACK6-NEXT: popq %r14 -; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp -; FALLBACK6-NEXT: retq -; -; FALLBACK7-LABEL: lshr_64bytes: -; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: pushq %r15 -; FALLBACK7-NEXT: pushq %r14 -; FALLBACK7-NEXT: pushq %rbx -; FALLBACK7-NEXT: movups (%rdi), %xmm0 -; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK7-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK7-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK7-NEXT: movl (%rsi), %eax -; FALLBACK7-NEXT: xorps %xmm4, %xmm4 -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: leal (,%rax,8), %ecx -; FALLBACK7-NEXT: andl $56, %ecx -; FALLBACK7-NEXT: andl $56, %eax -; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK7-NEXT: movq %r9, %rsi -; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK7-NEXT: movq %r10, %r8 -; FALLBACK7-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK7-NEXT: movq %r11, %rbx -; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK7-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK7-NEXT: movq %rax, %r15 -; FALLBACK7-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK7-NEXT: shrxq %rcx, %r11, %r10 -; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK7-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK7-NEXT: movq %r15, 8(%rdx) -; FALLBACK7-NEXT: movq %r9, 48(%rdx) -; FALLBACK7-NEXT: movq %rdi, 32(%rdx) -; FALLBACK7-NEXT: movq %rbx, 40(%rdx) -; FALLBACK7-NEXT: movq %r8, 16(%rdx) -; FALLBACK7-NEXT: movq %rsi, 24(%rdx) -; FALLBACK7-NEXT: movq %r14, (%rdx) -; FALLBACK7-NEXT: movq %r10, 56(%rdx) -; FALLBACK7-NEXT: popq %rbx -; 
FALLBACK7-NEXT: popq %r14 -; FALLBACK7-NEXT: popq %r15 -; FALLBACK7-NEXT: retq -; -; FALLBACK8-LABEL: lshr_64bytes: -; FALLBACK8: # %bb.0: -; FALLBACK8-NEXT: pushq %rbp -; FALLBACK8-NEXT: pushq %r15 -; FALLBACK8-NEXT: pushq %r14 -; FALLBACK8-NEXT: pushq %r13 -; FALLBACK8-NEXT: pushq %r12 -; FALLBACK8-NEXT: pushq %rbx -; FALLBACK8-NEXT: pushq %rax -; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK8-NEXT: movl (%rsi), %r9d -; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: leal (,%r9,8), %eax -; FALLBACK8-NEXT: andl $56, %eax -; FALLBACK8-NEXT: andl $56, %r9d -; FALLBACK8-NEXT: movq -128(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq -120(%rsp,%r9), %r8 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: movl %eax, %esi -; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rdi -; FALLBACK8-NEXT: orq %r10, %rdi -; FALLBACK8-NEXT: movq -104(%rsp,%r9), %r10 -; FALLBACK8-NEXT: movq %r10, %rbx -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %rbx -; FALLBACK8-NEXT: movq -96(%rsp,%r9), %r12 -; FALLBACK8-NEXT: leaq (%r12,%r12), %r11 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r11 -; FALLBACK8-NEXT: orq %rbx, %r11 -; FALLBACK8-NEXT: movq -112(%rsp,%r9), %rbx -; FALLBACK8-NEXT: movq %rbx, %r14 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r14 -; FALLBACK8-NEXT: addq %r10, %r10 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: orq %r14, %r10 -; FALLBACK8-NEXT: movq -88(%rsp,%r9), %r14 -; FALLBACK8-NEXT: movq %r14, %r13 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r13 -; FALLBACK8-NEXT: movq -80(%rsp,%r9), %rbp -; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r15 -; FALLBACK8-NEXT: orq %r13, %r15 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r12 -; FALLBACK8-NEXT: addq %r14, %r14 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r14 -; FALLBACK8-NEXT: orq %r12, %r14 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %rbp -; FALLBACK8-NEXT: movq -72(%rsp,%r9), %r9 -; FALLBACK8-NEXT: leaq (%r9,%r9), %r12 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %r12 -; FALLBACK8-NEXT: orq %rbp, %r12 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: addq %rbx, %rbx -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: orq %r8, %rbx -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shrq %cl, %r9 -; FALLBACK8-NEXT: movq %r9, 56(%rdx) -; FALLBACK8-NEXT: movq %rbx, 8(%rdx) -; FALLBACK8-NEXT: movq %r12, 48(%rdx) -; FALLBACK8-NEXT: movq %r14, 32(%rdx) -; FALLBACK8-NEXT: movq %r15, 40(%rdx) -; FALLBACK8-NEXT: movq %r10, 16(%rdx) -; FALLBACK8-NEXT: movq %r11, 24(%rdx) -; FALLBACK8-NEXT: movq %rdi, (%rdx) -; FALLBACK8-NEXT: addq $8, %rsp -; FALLBACK8-NEXT: popq %rbx -; FALLBACK8-NEXT: popq %r12 -; FALLBACK8-NEXT: popq %r13 -; FALLBACK8-NEXT: popq %r14 -; FALLBACK8-NEXT: popq %r15 -; FALLBACK8-NEXT: popq %rbp -; FALLBACK8-NEXT: vzeroupper -; FALLBACK8-NEXT: retq -; -; FALLBACK9-LABEL: lshr_64bytes: -; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: pushq %r15 -; 
FALLBACK9-NEXT: pushq %r14 -; FALLBACK9-NEXT: pushq %rbx -; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK9-NEXT: movl (%rsi), %eax -; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: leal (,%rax,8), %ecx -; FALLBACK9-NEXT: andl $56, %ecx -; FALLBACK9-NEXT: andl $56, %eax -; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq %r9, %rsi -; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK9-NEXT: movq %r10, %r8 -; FALLBACK9-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK9-NEXT: movq %r11, %rbx -; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r15 -; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK9-NEXT: shrq %cl, %r11 -; FALLBACK9-NEXT: movq %r15, 8(%rdx) -; FALLBACK9-NEXT: movq %r9, 48(%rdx) -; FALLBACK9-NEXT: movq %r11, 56(%rdx) -; FALLBACK9-NEXT: movq %rdi, 32(%rdx) -; FALLBACK9-NEXT: movq %rbx, 40(%rdx) -; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) -; FALLBACK9-NEXT: movq %r14, (%rdx) -; FALLBACK9-NEXT: popq %rbx -; FALLBACK9-NEXT: popq %r14 -; FALLBACK9-NEXT: popq %r15 -; FALLBACK9-NEXT: vzeroupper -; FALLBACK9-NEXT: retq -; -; FALLBACK10-LABEL: lshr_64bytes: -; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp -; FALLBACK10-NEXT: pushq %r15 -; FALLBACK10-NEXT: pushq %r14 -; FALLBACK10-NEXT: pushq %r13 -; FALLBACK10-NEXT: pushq %r12 -; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: pushq %rax -; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %eax -; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %esi -; FALLBACK10-NEXT: andl $56, %esi -; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK10-NEXT: movl %esi, %ebx -; FALLBACK10-NEXT: notb %bl -; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK10-NEXT: orq %r12, %r11 -; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp -; FALLBACK10-NEXT: movq 
-72(%rsp,%rax), %rax -; FALLBACK10-NEXT: shrxq %rsi, %rax, %rsi -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK10-NEXT: orq %r9, %rdi -; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK10-NEXT: orq %r14, %r9 -; FALLBACK10-NEXT: addq %r10, %r10 -; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK10-NEXT: orq %r15, %r10 -; FALLBACK10-NEXT: addq %rax, %rax -; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK10-NEXT: orq %r13, %rax -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK10-NEXT: orq %rbp, %rcx -; FALLBACK10-NEXT: movq %rsi, 56(%rdx) -; FALLBACK10-NEXT: movq %rcx, 8(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %r10, 32(%rdx) -; FALLBACK10-NEXT: movq %r9, 40(%rdx) -; FALLBACK10-NEXT: movq %rdi, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r8, (%rdx) -; FALLBACK10-NEXT: addq $8, %rsp -; FALLBACK10-NEXT: popq %rbx -; FALLBACK10-NEXT: popq %r12 -; FALLBACK10-NEXT: popq %r13 -; FALLBACK10-NEXT: popq %r14 -; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp -; FALLBACK10-NEXT: vzeroupper -; FALLBACK10-NEXT: retq -; -; FALLBACK11-LABEL: lshr_64bytes: -; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: pushq %r15 -; FALLBACK11-NEXT: pushq %r14 -; FALLBACK11-NEXT: pushq %rbx -; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK11-NEXT: movl (%rsi), %eax -; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: leal (,%rax,8), %ecx -; FALLBACK11-NEXT: andl $56, %ecx -; FALLBACK11-NEXT: andl $56, %eax -; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK11-NEXT: movq %r9, %rsi -; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK11-NEXT: movq %r10, %r8 -; FALLBACK11-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK11-NEXT: movq %r11, %rbx -; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK11-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK11-NEXT: movq %rax, %r15 -; FALLBACK11-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK11-NEXT: shrxq %rcx, %r11, %r10 -; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK11-NEXT: movq %r15, 8(%rdx) -; FALLBACK11-NEXT: movq %r9, 48(%rdx) -; FALLBACK11-NEXT: movq %rdi, 32(%rdx) -; FALLBACK11-NEXT: movq %rbx, 40(%rdx) -; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rsi, 24(%rdx) -; FALLBACK11-NEXT: movq %r14, (%rdx) -; FALLBACK11-NEXT: movq %r10, 56(%rdx) -; FALLBACK11-NEXT: popq %rbx -; FALLBACK11-NEXT: popq %r14 -; FALLBACK11-NEXT: popq %r15 -; FALLBACK11-NEXT: vzeroupper -; FALLBACK11-NEXT: retq -; -; FALLBACK12-LABEL: lshr_64bytes: -; FALLBACK12: # %bb.0: -; FALLBACK12-NEXT: pushq %rbp -; FALLBACK12-NEXT: pushq %r15 -; FALLBACK12-NEXT: pushq %r14 -; FALLBACK12-NEXT: pushq %r13 -; FALLBACK12-NEXT: pushq %r12 -; FALLBACK12-NEXT: pushq %rbx -; FALLBACK12-NEXT: pushq %rax -; FALLBACK12-NEXT: vmovups (%rdi), 
%zmm0 -; FALLBACK12-NEXT: movl (%rsi), %r9d -; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: leal (,%r9,8), %eax -; FALLBACK12-NEXT: andl $56, %eax -; FALLBACK12-NEXT: andl $56, %r9d -; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: movl %eax, %esi -; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %rbx -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12 -; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r11 -; FALLBACK12-NEXT: orq %rbx, %r11 -; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx -; FALLBACK12-NEXT: movq %rbx, %r14 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r14 -; FALLBACK12-NEXT: addq %r10, %r10 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r14, %r10 -; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14 -; FALLBACK12-NEXT: movq %r14, %r13 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r13 -; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp -; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r15 -; FALLBACK12-NEXT: orq %r13, %r15 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r12 -; FALLBACK12-NEXT: addq %r14, %r14 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r14 -; FALLBACK12-NEXT: orq %r12, %r14 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %rbp -; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %r12 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r12 -; FALLBACK12-NEXT: orq %rbp, %r12 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: addq %rbx, %rbx -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: orq %r8, %rbx -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, 56(%rdx) -; FALLBACK12-NEXT: movq %rbx, 8(%rdx) -; FALLBACK12-NEXT: movq %r12, 48(%rdx) -; FALLBACK12-NEXT: movq %r14, 32(%rdx) -; FALLBACK12-NEXT: movq %r15, 40(%rdx) -; FALLBACK12-NEXT: movq %r10, 16(%rdx) -; FALLBACK12-NEXT: movq %r11, 24(%rdx) -; FALLBACK12-NEXT: movq %rdi, (%rdx) -; FALLBACK12-NEXT: addq $8, %rsp -; FALLBACK12-NEXT: popq %rbx -; FALLBACK12-NEXT: popq %r12 -; FALLBACK12-NEXT: popq %r13 -; FALLBACK12-NEXT: popq %r14 -; FALLBACK12-NEXT: popq %r15 -; FALLBACK12-NEXT: popq %rbp -; FALLBACK12-NEXT: vzeroupper -; FALLBACK12-NEXT: retq -; -; FALLBACK13-LABEL: lshr_64bytes: -; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: pushq %r15 -; FALLBACK13-NEXT: pushq %r14 -; FALLBACK13-NEXT: pushq %rbx -; FALLBACK13-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK13-NEXT: movl (%rsi), %edi -; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: leal (,%rdi,8), %ecx -; FALLBACK13-NEXT: andl $56, %ecx -; FALLBACK13-NEXT: andl $56, %edi -; FALLBACK13-NEXT: movq -96(%rsp,%rdi), 
%rsi -; FALLBACK13-NEXT: movq -104(%rsp,%rdi), %r9 -; FALLBACK13-NEXT: movq %r9, %rax -; FALLBACK13-NEXT: shrdq %cl, %rsi, %rax -; FALLBACK13-NEXT: movq -112(%rsp,%rdi), %r10 -; FALLBACK13-NEXT: movq %r10, %r8 -; FALLBACK13-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK13-NEXT: movq -80(%rsp,%rdi), %r9 -; FALLBACK13-NEXT: movq -88(%rsp,%rdi), %r11 -; FALLBACK13-NEXT: movq %r11, %rbx -; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK13-NEXT: shrdq %cl, %r11, %rsi -; FALLBACK13-NEXT: movq -72(%rsp,%rdi), %r11 -; FALLBACK13-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK13-NEXT: movq -128(%rsp,%rdi), %r14 -; FALLBACK13-NEXT: movq -120(%rsp,%rdi), %rdi -; FALLBACK13-NEXT: movq %rdi, %r15 -; FALLBACK13-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK13-NEXT: shrdq %cl, %rdi, %r14 -; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK13-NEXT: shrq %cl, %r11 -; FALLBACK13-NEXT: movq %r15, 8(%rdx) -; FALLBACK13-NEXT: movq %r9, 48(%rdx) -; FALLBACK13-NEXT: movq %r11, 56(%rdx) -; FALLBACK13-NEXT: movq %rsi, 32(%rdx) -; FALLBACK13-NEXT: movq %rbx, 40(%rdx) -; FALLBACK13-NEXT: movq %r8, 16(%rdx) -; FALLBACK13-NEXT: movq %rax, 24(%rdx) -; FALLBACK13-NEXT: movq %r14, (%rdx) -; FALLBACK13-NEXT: popq %rbx -; FALLBACK13-NEXT: popq %r14 -; FALLBACK13-NEXT: popq %r15 -; FALLBACK13-NEXT: vzeroupper -; FALLBACK13-NEXT: retq -; -; FALLBACK14-LABEL: lshr_64bytes: -; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %rbp -; FALLBACK14-NEXT: pushq %r15 -; FALLBACK14-NEXT: pushq %r14 -; FALLBACK14-NEXT: pushq %r13 -; FALLBACK14-NEXT: pushq %r12 -; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: pushq %rax -; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK14-NEXT: movl (%rsi), %esi -; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rsi,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx -; FALLBACK14-NEXT: andl $56, %esi -; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %rax -; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12 -; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r13 -; FALLBACK14-NEXT: shrxq %rcx, %rax, %r9 -; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %r10 -; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14 -; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15 -; FALLBACK14-NEXT: movl %ecx, %ebx -; FALLBACK14-NEXT: notb %bl -; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp -; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK14-NEXT: orq %r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK14-NEXT: orq %r12, %r11 -; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12 -; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13 -; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp -; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi -; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK14-NEXT: orq %r9, %rdi -; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK14-NEXT: orq %r14, %r9 -; FALLBACK14-NEXT: addq %r10, %r10 -; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK14-NEXT: orq %r15, %r10 -; FALLBACK14-NEXT: addq %rsi, %rsi -; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi -; FALLBACK14-NEXT: orq %r13, %rsi -; FALLBACK14-NEXT: addq %rax, %rax -; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK14-NEXT: orq %rbp, %rax -; FALLBACK14-NEXT: movq %rcx, 56(%rdx) -; FALLBACK14-NEXT: 
movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rsi, 48(%rdx) -; FALLBACK14-NEXT: movq %r10, 32(%rdx) -; FALLBACK14-NEXT: movq %r9, 40(%rdx) -; FALLBACK14-NEXT: movq %rdi, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r8, (%rdx) -; FALLBACK14-NEXT: addq $8, %rsp -; FALLBACK14-NEXT: popq %rbx -; FALLBACK14-NEXT: popq %r12 -; FALLBACK14-NEXT: popq %r13 -; FALLBACK14-NEXT: popq %r14 -; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp -; FALLBACK14-NEXT: vzeroupper -; FALLBACK14-NEXT: retq -; -; FALLBACK15-LABEL: lshr_64bytes: -; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: pushq %r15 -; FALLBACK15-NEXT: pushq %r14 -; FALLBACK15-NEXT: pushq %rbx -; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK15-NEXT: movl (%rsi), %eax -; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: leal (,%rax,8), %ecx -; FALLBACK15-NEXT: andl $56, %ecx -; FALLBACK15-NEXT: andl $56, %eax -; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq %r9, %rsi -; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK15-NEXT: movq %r10, %r8 -; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK15-NEXT: movq %r11, %rbx -; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r15 -; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10 -; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK15-NEXT: movq %r15, 8(%rdx) -; FALLBACK15-NEXT: movq %r9, 48(%rdx) -; FALLBACK15-NEXT: movq %rdi, 32(%rdx) -; FALLBACK15-NEXT: movq %rbx, 40(%rdx) -; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rsi, 24(%rdx) -; FALLBACK15-NEXT: movq %r14, (%rdx) -; FALLBACK15-NEXT: movq %r10, 56(%rdx) -; FALLBACK15-NEXT: popq %rbx -; FALLBACK15-NEXT: popq %r14 -; FALLBACK15-NEXT: popq %r15 -; FALLBACK15-NEXT: vzeroupper -; FALLBACK15-NEXT: retq -; -; FALLBACK16-LABEL: lshr_64bytes: -; FALLBACK16: # %bb.0: -; FALLBACK16-NEXT: pushl %ebp -; FALLBACK16-NEXT: pushl %ebx -; FALLBACK16-NEXT: pushl %edi -; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $204, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl (%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 16(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 20(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 24(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 28(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
FALLBACK16-NEXT: movl 32(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 40(%eax), %ebp -; FALLBACK16-NEXT: movl 44(%eax), %ebx -; FALLBACK16-NEXT: movl 48(%eax), %edi -; FALLBACK16-NEXT: movl 52(%eax), %esi -; FALLBACK16-NEXT: movl 56(%eax), %edx -; FALLBACK16-NEXT: movl 60(%eax), %ecx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl (%eax), %eax -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %eax, %esi -; FALLBACK16-NEXT: andl $60, %esi -; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK16-NEXT: shll $3, %eax -; FALLBACK16-NEXT: andl $24, %eax -; FALLBACK16-NEXT: movl %edx, %edi -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: movl 72(%esp,%esi), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK16-NEXT: movb %al, %ch -; FALLBACK16-NEXT: notb %ch -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %edx, %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %edi, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK16-NEXT: movl %edx, %ebp -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %ebx -; FALLBACK16-NEXT: movb 
%ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %ebp, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: addl %edx, %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: movl %eax, %edx -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: addl %eax, %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebp, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %ebx, %ebx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebp, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: addl %ebx, %ebx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %eax, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK16-NEXT: leal (%edx,%edx), %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebp, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %ebx, %ebx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK16-NEXT: movl %edi, %ebp -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 112(%esp,%esi), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %ebp, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb 
%al, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: addl %edi, %edi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %edx, %edi -; FALLBACK16-NEXT: movl %esi, %edx -; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK16-NEXT: movl %esi, %ebx -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK16-NEXT: leal (%eax,%eax), %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %ebx, %ebp -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: addl %esi, %esi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %ebx, %esi -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %eax, %edx -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl %ebx, 60(%eax) -; FALLBACK16-NEXT: movl %edx, 56(%eax) -; FALLBACK16-NEXT: movl %esi, 48(%eax) -; FALLBACK16-NEXT: movl %ebp, 52(%eax) -; FALLBACK16-NEXT: movl %edi, 40(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 44(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 32(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 36(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 24(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 28(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 16(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 20(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 8(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 12(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, (%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 4(%eax) -; FALLBACK16-NEXT: addl $204, %esp -; FALLBACK16-NEXT: popl %esi -; FALLBACK16-NEXT: popl %edi -; FALLBACK16-NEXT: popl %ebx -; FALLBACK16-NEXT: popl %ebp -; FALLBACK16-NEXT: retl -; -; FALLBACK17-LABEL: lshr_64bytes: -; FALLBACK17: # %bb.0: -; FALLBACK17-NEXT: pushl %ebp -; FALLBACK17-NEXT: pushl %ebx -; FALLBACK17-NEXT: pushl %edi -; FALLBACK17-NEXT: pushl %esi -; FALLBACK17-NEXT: subl $188, %esp -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl (%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 
4(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 8(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 12(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 16(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 20(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 24(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 28(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 32(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 36(%ecx), %eax -; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 40(%ecx), %ebp -; FALLBACK17-NEXT: movl 44(%ecx), %ebx -; FALLBACK17-NEXT: movl 48(%ecx), %edi -; FALLBACK17-NEXT: movl 52(%ecx), %esi -; FALLBACK17-NEXT: movl 56(%ecx), %edx -; FALLBACK17-NEXT: movl 60(%ecx), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK17-NEXT: movl (%ecx), %ecx -; FALLBACK17-NEXT: xorps %xmm0, %xmm0 -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ecx, %ebp -; FALLBACK17-NEXT: andl $60, %ebp -; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shll $3, %ecx -; FALLBACK17-NEXT: andl $24, %ecx -; FALLBACK17-NEXT: shrdl %cl, %edx, %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %esi -; FALLBACK17-NEXT: 
shrdl %cl, %edi, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx -; FALLBACK17-NEXT: shrdl %cl, %esi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx -; FALLBACK17-NEXT: shrdl %cl, %edi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx -; FALLBACK17-NEXT: shrdl %cl, %esi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl %esi, %edx -; FALLBACK17-NEXT: shrdl %cl, %eax, %edi -; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %esi, %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %edx, %edi -; FALLBACK17-NEXT: shrdl %cl, %eax, %esi -; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 56(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK17-NEXT: shrl %cl, %eax -; FALLBACK17-NEXT: movl %eax, 60(%ebp) -; FALLBACK17-NEXT: movl %esi, 48(%ebp) -; FALLBACK17-NEXT: movl %edi, 52(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 40(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 44(%ebp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 32(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 36(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 24(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 28(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 16(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 20(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %ebx, (%ebp) -; FALLBACK17-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 4(%ebp) -; FALLBACK17-NEXT: addl $188, %esp -; FALLBACK17-NEXT: popl %esi -; FALLBACK17-NEXT: popl %edi -; FALLBACK17-NEXT: popl %ebx -; FALLBACK17-NEXT: popl %ebp -; FALLBACK17-NEXT: retl -; -; FALLBACK18-LABEL: lshr_64bytes: -; FALLBACK18: # %bb.0: -; FALLBACK18-NEXT: pushl %ebp -; FALLBACK18-NEXT: pushl %ebx -; FALLBACK18-NEXT: pushl %edi -; FALLBACK18-NEXT: pushl %esi -; FALLBACK18-NEXT: subl $204, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 12(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 16(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 20(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 24(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 28(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 32(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 36(%eax), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%eax), %ebp -; FALLBACK18-NEXT: movl 44(%eax), %ebx -; FALLBACK18-NEXT: movl 48(%eax), %edi -; FALLBACK18-NEXT: movl 52(%eax), %esi -; FALLBACK18-NEXT: movl 56(%eax), %edx -; FALLBACK18-NEXT: movl 60(%eax), %ecx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %eax -; FALLBACK18-NEXT: xorps %xmm0, %xmm0 -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; 
FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %eax, %ecx
-; FALLBACK18-NEXT: leal (,%eax,8), %edx
-; FALLBACK18-NEXT: andl $24, %edx
-; FALLBACK18-NEXT: andl $60, %ecx
-; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi
-; FALLBACK18-NEXT: movl 72(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %edi
-; FALLBACK18-NEXT: movl %edx, %ebx
-; FALLBACK18-NEXT: notb %bl
-; FALLBACK18-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax
-; FALLBACK18-NEXT: orl %edi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK18-NEXT: orl %edi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi
-; FALLBACK18-NEXT: leal (%esi,%esi), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK18-NEXT: orl %eax, %edi
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%eax,%eax), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: orl %esi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi
-; FALLBACK18-NEXT: leal (%esi,%esi), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK18-NEXT: orl %eax, %edi
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%eax,%eax), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: orl %esi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%eax,%eax), %esi
-; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi
-; FALLBACK18-NEXT: movl %ecx, %edi
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK18-NEXT: orl %ecx, %esi
-; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp
-; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
-; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax
-; FALLBACK18-NEXT: shrxl %edx, %eax, %edi
-; FALLBACK18-NEXT: orl %edi, %ecx
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: addl %eax, %eax
-; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi
-; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp
-; FALLBACK18-NEXT: shrxl %edx, %ebp, %edx
-; FALLBACK18-NEXT: addl %ebp, %ebp
-; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx
-; FALLBACK18-NEXT: orl %eax, %ebx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl %edx, 60(%eax)
-; FALLBACK18-NEXT: movl %ebx, 56(%eax)
-; FALLBACK18-NEXT: movl %edi, 48(%eax)
-; FALLBACK18-NEXT: movl %ecx, 52(%eax)
-; FALLBACK18-NEXT: movl %esi, 40(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 44(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 32(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 36(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 24(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 28(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 16(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 20(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 8(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 12(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, (%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 4(%eax)
-; FALLBACK18-NEXT: addl $204, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: lshr_64bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $188, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl (%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 4(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 8(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 12(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 16(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 20(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 24(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 28(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 32(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 36(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 40(%ecx), %ebp
-; FALLBACK19-NEXT: movl 44(%ecx), %ebx
-; FALLBACK19-NEXT: movl 48(%ecx), %edi
-; FALLBACK19-NEXT: movl 52(%ecx), %esi
-; FALLBACK19-NEXT: movl 56(%ecx), %edx
-; FALLBACK19-NEXT: movl 60(%ecx), %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl (%ecx), %ecx
-; FALLBACK19-NEXT: xorps %xmm0, %xmm0
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ecx, %ebp
-; FALLBACK19-NEXT: andl $60, %ebp
-; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shll $3, %ecx
-; FALLBACK19-NEXT: andl $24, %ecx
-; FALLBACK19-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %esi
-; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl %edi, %edx
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl %eax, 56(%ebp)
-; FALLBACK19-NEXT: movl %esi, 48(%ebp)
-; FALLBACK19-NEXT: movl %edx, 52(%ebp)
-; FALLBACK19-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 44(%ebp)
-; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 32(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 36(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 24(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 28(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 16(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 20(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 8(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 12(%ebp)
-; FALLBACK19-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK19-NEXT: movl %edi, (%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK19-NEXT: movl %eax, 60(%ebp)
-; FALLBACK19-NEXT: addl $188, %esp
-; FALLBACK19-NEXT: popl %esi
-; FALLBACK19-NEXT: popl %edi
-; FALLBACK19-NEXT: popl %ebx
-; FALLBACK19-NEXT: popl %ebp
-; FALLBACK19-NEXT: retl
-;
-; FALLBACK20-LABEL: lshr_64bytes:
-; FALLBACK20: # %bb.0:
-; FALLBACK20-NEXT: pushl %ebp
-; FALLBACK20-NEXT: pushl %ebx
-; FALLBACK20-NEXT: pushl %edi
-; FALLBACK20-NEXT: pushl %esi
-; FALLBACK20-NEXT: subl $204, %esp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movups (%ecx), %xmm0
-; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK20-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK20-NEXT: movl (%eax), %eax
-; FALLBACK20-NEXT: xorps %xmm4, %xmm4
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %eax, %esi
-; FALLBACK20-NEXT: andl $60, %esi
-; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx
-; FALLBACK20-NEXT: shll $3, %eax
-; FALLBACK20-NEXT: andl $24, %eax
-; FALLBACK20-NEXT: movl %edx, %edi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT: movb %al, %ch
-; FALLBACK20-NEXT: notb %ch
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %edi, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: addl %edx, %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: orl %edi, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx
-; FALLBACK20-NEXT: movl %edx, %ebp
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi
-; FALLBACK20-NEXT: leal (%edi,%edi), %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %ebp, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: addl %edx, %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: orl %ebx, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %ebp
-; FALLBACK20-NEXT: movl %eax, %edx
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: addl %eax, %eax
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %eax
-; FALLBACK20-NEXT: orl %ebp, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %edi, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %ebp
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi
-; FALLBACK20-NEXT: leal (%edi,%edi), %eax
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %eax
-; FALLBACK20-NEXT: orl %ebp, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %eax, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %ebp
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx
-; FALLBACK20-NEXT: leal (%edx,%edx), %eax
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %eax
-; FALLBACK20-NEXT: orl %ebp, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %edi, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi
-; FALLBACK20-NEXT: movl %edi, %ebp
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %ebp, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: addl %edi, %edi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edi
-; FALLBACK20-NEXT: orl %edx, %edi
-; FALLBACK20-NEXT: movl %esi, %edx
-; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi
-; FALLBACK20-NEXT: movl %esi, %ebx
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax
-; FALLBACK20-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebp
-; FALLBACK20-NEXT: orl %ebx, %ebp
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: addl %esi, %esi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: orl %ebx, %esi
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx
-; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: orl %eax, %edx
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl %ebx, 60(%eax)
-; FALLBACK20-NEXT: movl %edx, 56(%eax)
-; FALLBACK20-NEXT: movl %esi, 48(%eax)
-; FALLBACK20-NEXT: movl %ebp, 52(%eax)
-; FALLBACK20-NEXT: movl %edi, 40(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 44(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 32(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 36(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 24(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 28(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 16(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 20(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 8(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 12(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, (%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 4(%eax)
-; FALLBACK20-NEXT: addl $204, %esp
-; FALLBACK20-NEXT: popl %esi
-; FALLBACK20-NEXT: popl %edi
-; FALLBACK20-NEXT: popl %ebx
-; FALLBACK20-NEXT: popl %ebp
-; FALLBACK20-NEXT: retl
-;
-; FALLBACK21-LABEL: lshr_64bytes:
-; FALLBACK21: # %bb.0:
-; FALLBACK21-NEXT: pushl %ebp
-; FALLBACK21-NEXT: pushl %ebx
-; FALLBACK21-NEXT: pushl %edi
-; FALLBACK21-NEXT: pushl %esi
-; FALLBACK21-NEXT: subl $188, %esp
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK21-NEXT: movups (%ecx), %xmm0
-; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK21-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK21-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK21-NEXT: movl (%eax), %ecx
-; FALLBACK21-NEXT: xorps %xmm4, %xmm4
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %ecx, %ebp
-; FALLBACK21-NEXT: andl $60, %ebp
-; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shll $3, %ecx
-; FALLBACK21-NEXT: andl $24, %ecx
-; FALLBACK21-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %esi
-; FALLBACK21-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi
-; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl %esi, %edx
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edi
-; FALLBACK21-NEXT: shrdl %cl, %esi, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edi
-; FALLBACK21-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx
-; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK21-NEXT: movl %edx, 56(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK21-NEXT: shrl %cl, %eax
-; FALLBACK21-NEXT: movl %eax, 60(%ebp)
-; FALLBACK21-NEXT: movl %esi, 48(%ebp)
-; FALLBACK21-NEXT: movl %edi, 52(%ebp)
-; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 40(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 44(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 32(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 36(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 24(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 28(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 16(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 20(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 8(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 12(%ebp)
-; FALLBACK21-NEXT: movl %ebx, (%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 4(%ebp)
-; FALLBACK21-NEXT: addl $188, %esp
-; FALLBACK21-NEXT: popl %esi
-; FALLBACK21-NEXT: popl %edi
-; FALLBACK21-NEXT: popl %ebx
-; FALLBACK21-NEXT: popl %ebp
-; FALLBACK21-NEXT: retl
-;
-; FALLBACK22-LABEL: lshr_64bytes:
-; FALLBACK22: # %bb.0:
-; FALLBACK22-NEXT: pushl %ebp
-; FALLBACK22-NEXT: pushl %ebx
-; FALLBACK22-NEXT: pushl %edi
-; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $204, %esp
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK22-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK22-NEXT: movl (%eax), %ecx
-; FALLBACK22-NEXT: xorps %xmm4, %xmm4
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: leal (,%ecx,8), %edx
-; FALLBACK22-NEXT: andl $24, %edx
-; FALLBACK22-NEXT: andl $60, %ecx
-; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi
-; FALLBACK22-NEXT: movl 72(%esp,%ecx), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, %esi, %edi
-; FALLBACK22-NEXT: movl %edx, %ebx
-; FALLBACK22-NEXT: notb %bl
-; FALLBACK22-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp
-; FALLBACK22-NEXT: orl %edi, %ebp
-; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
-; FALLBACK22-NEXT: addl %esi, %esi
-; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK22-NEXT: orl %edi, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi
-; FALLBACK22-NEXT: leal (%esi,%esi), %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi
-; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK22-NEXT: orl %eax, %edi
-; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: leal (%eax,%eax), %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi
-; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: orl %esi, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi
-; FALLBACK22-NEXT: leal (%esi,%esi), %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi
-; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK22-NEXT: orl %eax, %edi
-; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: leal (%eax,%eax), %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi
-; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: orl %esi, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl %ecx, %eax
-; FALLBACK22-NEXT: movl 112(%esp,%ecx), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi
-; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx
-; FALLBACK22-NEXT: movl 108(%esp,%eax), %esi
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; FALLBACK22-NEXT: addl %esi, %esi
-; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK22-NEXT: orl %ecx, %esi
-; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp
-; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx
-; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax
-; FALLBACK22-NEXT: shrxl %edx, %eax, %edi
-; FALLBACK22-NEXT: orl %edi, %ecx
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: addl %eax, %eax
-; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi
-; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp
-; FALLBACK22-NEXT: shrxl %edx, %ebp, %edx
-; FALLBACK22-NEXT: addl %ebp, %ebp
-; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx
-; FALLBACK22-NEXT: orl %eax, %ebx
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl %edx, 60(%eax)
-; FALLBACK22-NEXT: movl %ebx, 56(%eax)
-; FALLBACK22-NEXT: movl %edi, 48(%eax)
-; FALLBACK22-NEXT: movl %ecx, 52(%eax)
-; FALLBACK22-NEXT: movl %esi, 40(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 44(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 32(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 36(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 24(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 28(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 16(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 20(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 8(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 12(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, (%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 4(%eax)
-; FALLBACK22-NEXT: addl $204, %esp
-; FALLBACK22-NEXT: popl %esi
-; FALLBACK22-NEXT: popl %edi
-; FALLBACK22-NEXT: popl %ebx
-; FALLBACK22-NEXT: popl %ebp
-; FALLBACK22-NEXT: retl
-;
-; FALLBACK23-LABEL: lshr_64bytes:
-; FALLBACK23: # %bb.0:
-; FALLBACK23-NEXT: pushl %ebp
-; FALLBACK23-NEXT: pushl %ebx
-; FALLBACK23-NEXT: pushl %edi
-; FALLBACK23-NEXT: pushl %esi
-; FALLBACK23-NEXT: subl $188, %esp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK23-NEXT: movups (%ecx), %xmm0
-; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK23-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK23-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK23-NEXT: movl (%eax), %ecx
-; FALLBACK23-NEXT: xorps %xmm4, %xmm4
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %ecx, %ebp
-; FALLBACK23-NEXT: andl $60, %ebp
-; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shll $3, %ecx
-; FALLBACK23-NEXT: andl $24, %ecx
-; FALLBACK23-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %esi
-; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK23-NEXT: movl %edi, %edx
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK23-NEXT: movl %eax, 56(%ebp)
-; FALLBACK23-NEXT: movl %esi, 48(%ebp)
-; FALLBACK23-NEXT: movl %edx, 52(%ebp)
-; FALLBACK23-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 44(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 32(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 36(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 24(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 28(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 16(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 20(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 8(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 12(%ebp)
-; FALLBACK23-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK23-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK23-NEXT: movl %edi, (%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK23-NEXT: movl %eax, 60(%ebp)
-; FALLBACK23-NEXT: addl $188, %esp
-; FALLBACK23-NEXT: popl %esi
-; FALLBACK23-NEXT: popl %edi
-; FALLBACK23-NEXT: popl %ebx
-; FALLBACK23-NEXT: popl %ebp
-; FALLBACK23-NEXT: retl
-;
-; FALLBACK24-LABEL: lshr_64bytes:
-; FALLBACK24: # %bb.0:
-; FALLBACK24-NEXT: pushl %ebp
-; FALLBACK24-NEXT: pushl %ebx
-; FALLBACK24-NEXT: pushl %edi
-; FALLBACK24-NEXT: pushl %esi
-; FALLBACK24-NEXT: subl $204, %esp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1
-; FALLBACK24-NEXT: movl (%eax), %ecx
-; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, %esi
-; FALLBACK24-NEXT: andl $60, %esi
-; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx
-; FALLBACK24-NEXT: shll $3, %ecx
-; FALLBACK24-NEXT: andl $24, %ecx
-; FALLBACK24-NEXT: movl %edx, %edi
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: movl 72(%esp,%esi), %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: leal (%eax,%eax), %ebx
-; FALLBACK24-NEXT: movl %ecx, %ebp
-; FALLBACK24-NEXT: movb %cl, %ch
-; FALLBACK24-NEXT: notb %ch
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %edi, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi
-; FALLBACK24-NEXT: movl %ebp, %eax
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: addl %edx, %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: orl %edi, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx
-; FALLBACK24-NEXT: movl %edx, %ebp
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi
-; FALLBACK24-NEXT: leal (%edi,%edi), %ebx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %ebp, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: addl %edx, %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: orl %ebx, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %ebp
-; FALLBACK24-NEXT: movl %eax, %edx
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: addl %eax, %eax
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %eax
-; FALLBACK24-NEXT: orl %ebp, %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: addl %ebx, %ebx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %edi, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %ebp
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi
-; FALLBACK24-NEXT: leal (%edi,%edi), %eax
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %eax
-; FALLBACK24-NEXT: orl %ebp, %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: addl %ebx, %ebx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %eax, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %ebp
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx
-; FALLBACK24-NEXT: leal (%edx,%edx), %eax
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %eax
-; FALLBACK24-NEXT: orl %ebp, %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: addl %ebx, %ebx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %edi, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi
-; FALLBACK24-NEXT: movl %edi, %ebp
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx
-; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %ebp, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: addl %edi, %edi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %edi
-; FALLBACK24-NEXT: orl %edx, %edi
-; FALLBACK24-NEXT: movl %esi, %edx
-; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi
-; FALLBACK24-NEXT: movl %esi, %ebx
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax
-; FALLBACK24-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebp
-; FALLBACK24-NEXT: orl %ebx, %ebp
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: addl %esi, %esi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: orl %ebx, %esi
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx
-; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: orl %eax, %edx
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl %ebx, 60(%eax)
-; FALLBACK24-NEXT: movl %edx, 56(%eax)
-; FALLBACK24-NEXT: movl %esi, 48(%eax)
-; FALLBACK24-NEXT: movl %ebp, 52(%eax)
-; FALLBACK24-NEXT: movl %edi, 40(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 44(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 32(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 36(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 24(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 28(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 16(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 20(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 8(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 12(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, (%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 4(%eax)
-; FALLBACK24-NEXT: addl $204, %esp
-; FALLBACK24-NEXT: popl %esi
-; FALLBACK24-NEXT: popl %edi
-; FALLBACK24-NEXT: popl %ebx
-; FALLBACK24-NEXT: popl %ebp
-; FALLBACK24-NEXT: vzeroupper
-; FALLBACK24-NEXT: retl
-;
-; FALLBACK25-LABEL: lshr_64bytes:
-; FALLBACK25: # %bb.0:
-; FALLBACK25-NEXT: pushl %ebp
-; FALLBACK25-NEXT: pushl %ebx
-; FALLBACK25-NEXT: pushl %edi
-; FALLBACK25-NEXT: pushl %esi
-; FALLBACK25-NEXT: subl $188, %esp
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1
-; FALLBACK25-NEXT: movl (%eax), %ecx
-; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %ecx, %ebp
-; FALLBACK25-NEXT: andl $60, %ebp
-; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shll $3, %ecx
-; FALLBACK25-NEXT: andl $24, %ecx
-; FALLBACK25-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %esi
-; FALLBACK25-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi
-; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl %esi, %edx
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edi
-; FALLBACK25-NEXT: shrdl %cl, %esi, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edi
-; FALLBACK25-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx
-; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK25-NEXT: movl %edx, 56(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK25-NEXT: shrl %cl, %eax
-; FALLBACK25-NEXT: movl %eax, 60(%ebp)
-; FALLBACK25-NEXT: movl %esi, 48(%ebp)
-; FALLBACK25-NEXT: movl %edi, 52(%ebp)
-; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 40(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 44(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 32(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 36(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 24(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 28(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 16(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 20(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 8(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 12(%ebp)
-; FALLBACK25-NEXT: movl %ebx, (%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 4(%ebp)
-; FALLBACK25-NEXT: addl $188, %esp
-; FALLBACK25-NEXT: popl %esi
-; FALLBACK25-NEXT: popl %edi
-; FALLBACK25-NEXT: popl %ebx
-; FALLBACK25-NEXT: popl %ebp
-; FALLBACK25-NEXT: vzeroupper
-; FALLBACK25-NEXT: retl
-;
-; FALLBACK26-LABEL: lshr_64bytes:
-; FALLBACK26: # %bb.0:
-; FALLBACK26-NEXT: pushl %ebp
-; FALLBACK26-NEXT: pushl %ebx
-; FALLBACK26-NEXT: pushl %edi
-; FALLBACK26-NEXT: pushl %esi
-; FALLBACK26-NEXT: subl $204, %esp
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1
-; FALLBACK26-NEXT: movl (%eax), %ecx
-; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: leal (,%ecx,8), %edx
-; FALLBACK26-NEXT: andl $24, %edx
-; FALLBACK26-NEXT: andl $60, %ecx
-; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi
-; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, %esi, %edi
-; FALLBACK26-NEXT: movl %edx, %ebx
-; FALLBACK26-NEXT: notb %bl
-; FALLBACK26-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp
-; FALLBACK26-NEXT: orl %edi, %ebp
-; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
-; FALLBACK26-NEXT: addl %esi, %esi
-; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK26-NEXT: orl %edi, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi
-; FALLBACK26-NEXT: leal (%esi,%esi), %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK26-NEXT: orl %eax, %edi
-; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: leal (%eax,%eax), %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: orl %esi, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi
-; FALLBACK26-NEXT: leal (%esi,%esi), %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK26-NEXT: orl %eax, %edi
-; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: leal (%eax,%eax), %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: orl %esi, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: leal (%eax,%eax), %esi
-; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi
-; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: addl %esi, %esi
-; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK26-NEXT: orl %eax, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 120(%esp,%ecx), %ebp
-; FALLBACK26-NEXT: leal (%ebp,%ebp), %eax
-; FALLBACK26-NEXT: shlxl %ebx, %eax, %esi
-; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax
-; FALLBACK26-NEXT: shrxl %edx, %eax, %edi
-; FALLBACK26-NEXT: orl %edi, %esi
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: addl %eax, %eax
-; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi
-; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax
-; FALLBACK26-NEXT: movl 124(%esp,%ecx), %ecx
-; FALLBACK26-NEXT: shrxl %edx, %ecx, %edx
-; FALLBACK26-NEXT: addl %ecx, %ecx
-; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebx
-; FALLBACK26-NEXT: orl %eax, %ebx
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT: movl %edx, 60(%ecx)
-; FALLBACK26-NEXT: movl %ebx, 56(%ecx)
-; FALLBACK26-NEXT: movl %edi, 48(%ecx)
-; FALLBACK26-NEXT: movl %esi, 52(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 40(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 44(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 32(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 36(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 24(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 28(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 16(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 20(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 8(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 12(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, (%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 4(%ecx)
-; FALLBACK26-NEXT: addl $204, %esp
-; FALLBACK26-NEXT: popl %esi
-; FALLBACK26-NEXT: popl %edi
-; FALLBACK26-NEXT: popl %ebx
-; FALLBACK26-NEXT: popl %ebp
-; FALLBACK26-NEXT: vzeroupper
-; FALLBACK26-NEXT: retl
-;
-; FALLBACK27-LABEL: lshr_64bytes:
-; FALLBACK27: # %bb.0:
-; FALLBACK27-NEXT: pushl %ebp
-; FALLBACK27-NEXT: pushl %ebx
-; FALLBACK27-NEXT: pushl %edi
-; FALLBACK27-NEXT: pushl %esi
-; FALLBACK27-NEXT: subl $188, %esp
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1
-; FALLBACK27-NEXT: movl (%eax), %ecx
-; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %ecx, %ebp
-; FALLBACK27-NEXT: andl $60, %ebp
-; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shll $3, %ecx
-; FALLBACK27-NEXT: andl $24, %ecx
-; FALLBACK27-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %esi
-; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl %edi, %edx
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK27-NEXT: movl %eax, 56(%ebp)
-; FALLBACK27-NEXT: movl %esi, 48(%ebp)
-; FALLBACK27-NEXT: movl %edx, 52(%ebp)
-; FALLBACK27-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 44(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 32(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 36(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 24(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 28(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 16(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 20(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 8(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 12(%ebp)
-; FALLBACK27-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK27-NEXT: movl %edi, (%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK27-NEXT: movl %eax, 60(%ebp)
-; FALLBACK27-NEXT: addl $188, %esp
-; FALLBACK27-NEXT: popl
%esi -; FALLBACK27-NEXT: popl %edi -; FALLBACK27-NEXT: popl %ebx -; FALLBACK27-NEXT: popl %ebp -; FALLBACK27-NEXT: vzeroupper -; FALLBACK27-NEXT: retl -; -; FALLBACK28-LABEL: lshr_64bytes: -; FALLBACK28: # %bb.0: -; FALLBACK28-NEXT: pushl %ebp -; FALLBACK28-NEXT: pushl %ebx -; FALLBACK28-NEXT: pushl %edi -; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $204, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK28-NEXT: movl (%eax), %ecx -; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, %esi -; FALLBACK28-NEXT: andl $60, %esi -; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK28-NEXT: shll $3, %ecx -; FALLBACK28-NEXT: andl $24, %ecx -; FALLBACK28-NEXT: movl %edx, %edi -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: movl 72(%esp,%esi), %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%eax,%eax), %ebx -; FALLBACK28-NEXT: movl %ecx, %ebp -; FALLBACK28-NEXT: movb %cl, %ch -; FALLBACK28-NEXT: notb %ch -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK28-NEXT: movl %ebp, %eax -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %edx, %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %edi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK28-NEXT: movl %edx, %ebp -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %edx, %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: addl %eax, %eax -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 
96(%esp,%esi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %eax -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %eax, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK28-NEXT: leal (%edx,%edx), %eax -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK28-NEXT: movl %edi, %ebp -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx -; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: addl %edi, %edi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl %edx, %edi -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK28-NEXT: movl %esi, %ebx -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK28-NEXT: leal (%eax,%eax), %ebp -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %ebx, %ebp -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %eax, %edx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
FALLBACK28-NEXT: movl %ebx, 60(%eax) -; FALLBACK28-NEXT: movl %edx, 56(%eax) -; FALLBACK28-NEXT: movl %esi, 48(%eax) -; FALLBACK28-NEXT: movl %ebp, 52(%eax) -; FALLBACK28-NEXT: movl %edi, 40(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 44(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 32(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 36(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 24(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 28(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 16(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 20(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 8(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 12(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, (%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 4(%eax) -; FALLBACK28-NEXT: addl $204, %esp -; FALLBACK28-NEXT: popl %esi -; FALLBACK28-NEXT: popl %edi -; FALLBACK28-NEXT: popl %ebx -; FALLBACK28-NEXT: popl %ebp -; FALLBACK28-NEXT: vzeroupper -; FALLBACK28-NEXT: retl -; -; FALLBACK29-LABEL: lshr_64bytes: -; FALLBACK29: # %bb.0: -; FALLBACK29-NEXT: pushl %ebp -; FALLBACK29-NEXT: pushl %ebx -; FALLBACK29-NEXT: pushl %edi -; FALLBACK29-NEXT: pushl %esi -; FALLBACK29-NEXT: subl $188, %esp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK29-NEXT: movl (%eax), %ecx -; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %ecx, %ebp -; FALLBACK29-NEXT: andl $60, %ebp -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shll $3, %ecx -; FALLBACK29-NEXT: andl $24, %ecx -; FALLBACK29-NEXT: shrdl %cl, %edx, %eax -; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %esi -; FALLBACK29-NEXT: shrdl %cl, %edi, %esi -; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %edi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl %esi, %edx -; FALLBACK29-NEXT: shrdl %cl, %eax, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi -; FALLBACK29-NEXT: shrdl %cl, %esi, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi -; FALLBACK29-NEXT: shrdl %cl, %edx, %edi -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %edx, 56(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK29-NEXT: shrl %cl, %eax -; FALLBACK29-NEXT: movl %eax, 60(%ebp) -; FALLBACK29-NEXT: movl %esi, 48(%ebp) -; FALLBACK29-NEXT: movl %edi, 52(%ebp) -; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 40(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 44(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 32(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 36(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 24(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 28(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %ebx, (%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 4(%ebp) -; FALLBACK29-NEXT: addl $188, %esp -; FALLBACK29-NEXT: popl %esi -; FALLBACK29-NEXT: popl %edi -; FALLBACK29-NEXT: popl %ebx -; FALLBACK29-NEXT: popl %ebp -; FALLBACK29-NEXT: vzeroupper -; FALLBACK29-NEXT: retl -; -; FALLBACK30-LABEL: lshr_64bytes: -; FALLBACK30: # %bb.0: -; FALLBACK30-NEXT: pushl %ebp -; FALLBACK30-NEXT: pushl %ebx -; FALLBACK30-NEXT: pushl %edi -; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $204, %esp -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK30-NEXT: movl (%eax), %edx -; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; 
FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: leal (,%edx,8), %ecx -; FALLBACK30-NEXT: andl $24, %ecx -; FALLBACK30-NEXT: andl $60, %edx -; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi -; FALLBACK30-NEXT: movl 72(%esp,%edx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %edi -; FALLBACK30-NEXT: movl %ecx, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: leal (%eax,%eax), %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp -; FALLBACK30-NEXT: orl %edi, %ebp -; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, 64(%esp,%edx), %edi -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 76(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi -; FALLBACK30-NEXT: orl %eax, %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 88(%esp,%edx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 84(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 96(%esp,%edx), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 92(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi -; FALLBACK30-NEXT: orl %eax, %edi -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 104(%esp,%edx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 100(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 112(%esp,%edx), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl 108(%esp,%edx), %esi -; 
FALLBACK30-NEXT: shrxl %ecx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 120(%esp,%edx), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %esi -; FALLBACK30-NEXT: movl 116(%esp,%edx), %eax -; FALLBACK30-NEXT: shrxl %ecx, %eax, %edi -; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: shrxl %ecx, %ebp, %eax -; FALLBACK30-NEXT: movl 124(%esp,%edx), %edx -; FALLBACK30-NEXT: shrxl %ecx, %edx, %ebp -; FALLBACK30-NEXT: leal (%edx,%edx), %ecx -; FALLBACK30-NEXT: shlxl %ebx, %ecx, %edx -; FALLBACK30-NEXT: orl %eax, %edx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: movl %ebp, 60(%ecx) -; FALLBACK30-NEXT: movl %edx, 56(%ecx) -; FALLBACK30-NEXT: movl %edi, 48(%ecx) -; FALLBACK30-NEXT: movl %esi, 52(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 40(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 44(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 32(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 36(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 24(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 28(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 16(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 8(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 12(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, (%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 4(%ecx) -; FALLBACK30-NEXT: addl $204, %esp -; FALLBACK30-NEXT: popl %esi -; FALLBACK30-NEXT: popl %edi -; FALLBACK30-NEXT: popl %ebx -; FALLBACK30-NEXT: popl %ebp -; FALLBACK30-NEXT: vzeroupper -; FALLBACK30-NEXT: retl -; -; FALLBACK31-LABEL: lshr_64bytes: -; FALLBACK31: # %bb.0: -; FALLBACK31-NEXT: pushl %ebp -; FALLBACK31-NEXT: pushl %ebx -; FALLBACK31-NEXT: pushl %edi -; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $188, %esp -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK31-NEXT: movl (%eax), %ecx -; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; 
FALLBACK31-NEXT: movl %ecx, %ebp -; FALLBACK31-NEXT: andl $60, %ebp -; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shll $3, %ecx -; FALLBACK31-NEXT: andl $24, %ecx -; FALLBACK31-NEXT: shrdl %cl, %edx, %eax -; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %esi -; FALLBACK31-NEXT: shrdl %cl, %edi, %esi -; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %edi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %esi -; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx -; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %esi, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax -; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl %edi, %edx -; FALLBACK31-NEXT: shrdl %cl, %eax, %edx -; FALLBACK31-NEXT: shrdl %cl, %edi, %esi -; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK31-NEXT: movl %eax, 56(%ebp) -; FALLBACK31-NEXT: movl %esi, 48(%ebp) -; FALLBACK31-NEXT: movl %edx, 52(%ebp) -; FALLBACK31-NEXT: movl %ebx, 40(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 44(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 32(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 36(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 24(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 28(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 16(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 20(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte 
Reload -; FALLBACK31-NEXT: movl %eax, 8(%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK31-NEXT: movl %eax, 12(%ebp) -; FALLBACK31-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK31-NEXT: shrdl %cl, %edx, %edi -; FALLBACK31-NEXT: movl %edi, (%ebp) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 4(%ebp) -; FALLBACK31-NEXT: movl %eax, 60(%ebp) -; FALLBACK31-NEXT: addl $188, %esp -; FALLBACK31-NEXT: popl %esi -; FALLBACK31-NEXT: popl %edi -; FALLBACK31-NEXT: popl %ebx -; FALLBACK31-NEXT: popl %ebp -; FALLBACK31-NEXT: vzeroupper -; FALLBACK31-NEXT: retl +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %edi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rdi,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %edi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %r8, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rdi), %r10 +; 
X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rdi), %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r15, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rdi), %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r13,%r13), %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r12, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r14, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rdi,%rdi), %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r13, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; 
X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rax +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 48(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 32(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes: +; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, 
-{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %esi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r8, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r10,%r10), %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r11, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r9 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r10, %r11 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r15,%r15), %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r12, %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r14, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r10 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r15, %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rax,%rax), %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r14, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %rcx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %rax, %rax +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 48(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 32(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r12, 40(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, 
%rsp +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r12 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15 +; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rax, %rax +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 48(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 32(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx) +; 
X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq +; +; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes: +; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %r8d +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%r8,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %r8d +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%r8), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%r8), %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%r8), %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%r8), %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%r8), %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r14, %r10 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%r8), %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%r8), %rbp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%rbp,%rbp), %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; 
X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r13, %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r14, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r12, %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%r8), %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbp, %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq $8, %rsp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp +; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes: +; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10 +; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r8 +; 
X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r14,%r14), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rbx, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r13, %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r15, %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r12, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rax,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r9, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r11, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes:
+; X64-NO-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %r9d
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (,%r9,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %r9d
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -128(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -120(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -104(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -96(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r12,%r12), %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -112(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r14, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -88(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -80(%rsp,%r9), %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r13, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %r14, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r12, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -72(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r9,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r8, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -120(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -112(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -104(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -96(%rsp,%rsi), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r14,%r14), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -88(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %rbx, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -80(%rsp,%rsi), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r12,%r12), %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r13, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r15, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r12, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -72(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%rsi,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r10, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rcx, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r11, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %r9d
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (,%r9,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %r9d
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -128(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -120(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -104(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -96(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r12,%r12), %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -112(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r14, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -88(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -80(%rsp,%r9), %rbp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r13, %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %r14, %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r12, %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -72(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r9,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r8, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %edi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rdi,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %edi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -96(%rsp,%rdi), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -104(%rsp,%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -112(%rsp,%rdi), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -80(%rsp,%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -88(%rsp,%rdi), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r11, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -72(%rsp,%rdi), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -128(%rsp,%rdi), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -120(%rsp,%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %rdi, %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rsi, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -120(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -112(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -104(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -96(%rsp,%rsi), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r14,%r14), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -88(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %rbx, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -80(%rsp,%rsi), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r12,%r12), %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r13, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r15, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r12, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -72(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%rsi,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r10, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rcx, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r11, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retq
+;
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%eax), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%eax), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%eax), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%eax), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%ecx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%ecx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%ecx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%ecx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%ebx,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 96(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 92(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 104(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 100(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 112(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 108(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 120(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 116(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 124(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 52(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 40(%eax)
+;
X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%ecx), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%ecx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%ecx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%ecx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%ecx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 64(%esp,%ebp), %edi +; 
X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 104(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 100(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebp, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 56(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 52(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 40(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 36(%ebp) +; 
X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes: +; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%esi), %ecx +; 
X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %ch +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%esi), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 96(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax +; 
X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 100(%esp,%esi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 104(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edx,%edx), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 108(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 112(%esp,%esi), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 116(%esp,%esi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 120(%esp,%edx), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; 
X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 124(%esp,%edx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ebx,%ebx), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 60(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 48(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 40(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), 
%xmm1 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 104(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 100(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 108(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: 
movups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%ebx,8), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %dl +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 96(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 92(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 104(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 100(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 112(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 108(%esp,%ebx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 120(%esp,%ebx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%edi,%edi), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ecx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 116(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 124(%esp,%ebx), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebx +; 
X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ebx, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 60(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 48(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 52(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 40(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, 
{{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 104(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 100(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi +; 
X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 108(%esp,%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 56(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 52(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 40(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes: +; X86-NO-SHLD-NO-BMI2-AVX1: # %bb.0: +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1 +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; 
X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 72(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%eax,%eax), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %cl, %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%ecx,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 68(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 72(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 80(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 76(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 84(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 96(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 92(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 112(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 108(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 120(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 116(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 124(%esp,%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, 60(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, 56(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 48(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 52(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, 40(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 44(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 32(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 36(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 28(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 16(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 20(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 12(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, (%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 4(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 72(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %cl, %ch
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%ecx,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 68(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 72(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 80(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 76(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 84(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 96(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 92(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 112(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 108(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 120(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 116(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 124(%esp,%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 60(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 56(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 48(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 52(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, 40(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, (%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 4(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retl
 %src = load i512, ptr %src.ptr, align 1
 %byteOff = load i512, ptr %byteOff.ptr, align 1
 %bitOff = shl i512 %byteOff, 3
@@ -16020,3770 +13714,3774 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 }
 
 define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: shl_64bytes:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %r15
-; FALLBACK0-NEXT: pushq %r14
-; FALLBACK0-NEXT: pushq %r13
-; FALLBACK0-NEXT: pushq %r12
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rax
-; FALLBACK0-NEXT: movq 8(%rdi), %rcx
-; FALLBACK0-NEXT: movq 16(%rdi), %r8
-; FALLBACK0-NEXT: movq 24(%rdi), %r9
-; FALLBACK0-NEXT: movq 32(%rdi), %r10
-; FALLBACK0-NEXT: movq 40(%rdi), %r11
-; FALLBACK0-NEXT: movq 48(%rdi), %rbx
-; FALLBACK0-NEXT: movq 56(%rdi), %rdi
-; FALLBACK0-NEXT: movl (%rsi), %esi
-; FALLBACK0-NEXT: xorps %xmm0, %xmm0
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: leal (,%rsi,8), %eax
-; FALLBACK0-NEXT: andl $56, %eax
-; FALLBACK0-NEXT: andl $56, %esi
-; FALLBACK0-NEXT: negl %esi
-; FALLBACK0-NEXT: movslq %esi, %rbx
-; FALLBACK0-NEXT: movq -64(%rsp,%rbx), %r8
-;
FALLBACK0-NEXT: movq -56(%rsp,%rbx), %rdi -; FALLBACK0-NEXT: movq %rdi, %r10 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: movl %eax, %esi -; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq %r8, %r9 -; FALLBACK0-NEXT: shrq %r9 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r9 -; FALLBACK0-NEXT: orq %r10, %r9 -; FALLBACK0-NEXT: movq -40(%rsp,%rbx), %r10 -; FALLBACK0-NEXT: movq %r10, %r14 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r14 -; FALLBACK0-NEXT: movq -48(%rsp,%rbx), %r15 -; FALLBACK0-NEXT: movq %r15, %r11 -; FALLBACK0-NEXT: shrq %r11 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 -; FALLBACK0-NEXT: orq %r14, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r15 -; FALLBACK0-NEXT: shrq %rdi -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %rdi -; FALLBACK0-NEXT: orq %r15, %rdi -; FALLBACK0-NEXT: movq -24(%rsp,%rbx), %r14 -; FALLBACK0-NEXT: movq %r14, %r12 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r12 -; FALLBACK0-NEXT: movq -32(%rsp,%rbx), %r13 -; FALLBACK0-NEXT: movq %r13, %r15 -; FALLBACK0-NEXT: shrq %r15 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r15 -; FALLBACK0-NEXT: orq %r12, %r15 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r13 -; FALLBACK0-NEXT: shrq %r10 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: orq %r13, %r10 -; FALLBACK0-NEXT: movq -8(%rsp,%rbx), %r12 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r12 -; FALLBACK0-NEXT: movq -16(%rsp,%rbx), %rbx -; FALLBACK0-NEXT: movq %rbx, %r13 -; FALLBACK0-NEXT: shrq %r13 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r13 -; FALLBACK0-NEXT: orq %r12, %r13 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %rbx -; FALLBACK0-NEXT: shrq %r14 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shrq %cl, %r14 -; FALLBACK0-NEXT: orq %rbx, %r14 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: movq %r8, (%rdx) -; FALLBACK0-NEXT: movq %r14, 48(%rdx) -; FALLBACK0-NEXT: movq %r13, 56(%rdx) -; FALLBACK0-NEXT: movq %r10, 32(%rdx) -; FALLBACK0-NEXT: movq %r15, 40(%rdx) -; FALLBACK0-NEXT: movq %rdi, 16(%rdx) -; FALLBACK0-NEXT: movq %r11, 24(%rdx) -; FALLBACK0-NEXT: movq %r9, 8(%rdx) -; FALLBACK0-NEXT: popq %rbx -; FALLBACK0-NEXT: popq %r12 -; FALLBACK0-NEXT: popq %r13 -; FALLBACK0-NEXT: popq %r14 -; FALLBACK0-NEXT: popq %r15 -; FALLBACK0-NEXT: retq -; -; FALLBACK1-LABEL: shl_64bytes: -; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: pushq %r14 -; FALLBACK1-NEXT: pushq %rbx -; FALLBACK1-NEXT: pushq %rax -; FALLBACK1-NEXT: movq (%rdi), %rax -; FALLBACK1-NEXT: movq 8(%rdi), %rcx -; FALLBACK1-NEXT: movq 16(%rdi), %r8 -; FALLBACK1-NEXT: movq 24(%rdi), %r9 -; FALLBACK1-NEXT: movq 32(%rdi), %r10 -; FALLBACK1-NEXT: movq 40(%rdi), %r11 -; FALLBACK1-NEXT: movq 48(%rdi), %rbx -; FALLBACK1-NEXT: movq 56(%rdi), %rdi -; FALLBACK1-NEXT: movl (%rsi), %esi -; FALLBACK1-NEXT: xorps %xmm0, %xmm0 -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; 
FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: leal (,%rsi,8), %ecx -; FALLBACK1-NEXT: andl $56, %ecx -; FALLBACK1-NEXT: andl $56, %esi -; FALLBACK1-NEXT: negl %esi -; FALLBACK1-NEXT: movslq %esi, %r9 -; FALLBACK1-NEXT: movq -48(%rsp,%r9), %rax -; FALLBACK1-NEXT: movq -40(%rsp,%r9), %r10 -; FALLBACK1-NEXT: movq %r10, %rsi -; FALLBACK1-NEXT: shldq %cl, %rax, %rsi -; FALLBACK1-NEXT: movq -64(%rsp,%r9), %r8 -; FALLBACK1-NEXT: movq -56(%rsp,%r9), %rdi -; FALLBACK1-NEXT: shldq %cl, %rdi, %rax -; FALLBACK1-NEXT: movq -32(%rsp,%r9), %r11 -; FALLBACK1-NEXT: movq -24(%rsp,%r9), %rbx -; FALLBACK1-NEXT: movq %rbx, %r14 -; FALLBACK1-NEXT: shldq %cl, %r11, %r14 -; FALLBACK1-NEXT: shldq %cl, %r10, %r11 -; FALLBACK1-NEXT: movq -16(%rsp,%r9), %r10 -; FALLBACK1-NEXT: movq -8(%rsp,%r9), %r9 -; FALLBACK1-NEXT: shldq %cl, %r10, %r9 -; FALLBACK1-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK1-NEXT: shldq %cl, %r8, %rdi -; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: shlq %cl, %r8 -; FALLBACK1-NEXT: movq %r10, 48(%rdx) -; FALLBACK1-NEXT: movq %r9, 56(%rdx) -; FALLBACK1-NEXT: movq %r11, 32(%rdx) -; FALLBACK1-NEXT: movq %r14, 40(%rdx) -; FALLBACK1-NEXT: movq %rax, 16(%rdx) -; FALLBACK1-NEXT: movq %rsi, 24(%rdx) -; FALLBACK1-NEXT: movq %r8, (%rdx) -; FALLBACK1-NEXT: movq %rdi, 8(%rdx) -; FALLBACK1-NEXT: addq $8, %rsp -; FALLBACK1-NEXT: popq %rbx -; FALLBACK1-NEXT: popq %r14 -; FALLBACK1-NEXT: retq -; -; FALLBACK2-LABEL: shl_64bytes: -; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp -; FALLBACK2-NEXT: pushq %r15 -; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 -; FALLBACK2-NEXT: pushq %r12 -; FALLBACK2-NEXT: pushq %rbx -; FALLBACK2-NEXT: pushq %rax -; FALLBACK2-NEXT: movq (%rdi), %rax -; FALLBACK2-NEXT: movq 8(%rdi), %rcx -; FALLBACK2-NEXT: movq 16(%rdi), %r8 -; FALLBACK2-NEXT: movq 24(%rdi), %r9 -; FALLBACK2-NEXT: movq 32(%rdi), %r10 -; FALLBACK2-NEXT: movq 40(%rdi), %r11 -; FALLBACK2-NEXT: movq 48(%rdi), %rbx -; FALLBACK2-NEXT: movq 56(%rdi), %rdi -; FALLBACK2-NEXT: movl (%rsi), %esi -; FALLBACK2-NEXT: xorps %xmm0, %xmm0 -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: leal (,%rsi,8), %eax -; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: andl $56, %esi -; FALLBACK2-NEXT: negl %esi -; FALLBACK2-NEXT: movslq %esi, %rsi -; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r10 -; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r9 -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r14 -; FALLBACK2-NEXT: shlxq %rax, %r14, %rbx -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK2-NEXT: shlxq %rax, %r8, %r15 -; FALLBACK2-NEXT: shlxq %rax, %r10, %r12 -; FALLBACK2-NEXT: movl %eax, %r13d -; FALLBACK2-NEXT: notb %r13b -; FALLBACK2-NEXT: shrq %r10 -; FALLBACK2-NEXT: shrxq %r13, %r10, %r10 -; FALLBACK2-NEXT: orq %r9, %r10 -; 
FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp -; FALLBACK2-NEXT: shrq %r14 -; FALLBACK2-NEXT: shrxq %r13, %r14, %r14 -; FALLBACK2-NEXT: orq %r11, %r14 -; FALLBACK2-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 -; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rax -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx -; FALLBACK2-NEXT: orq %rbx, %rcx -; FALLBACK2-NEXT: shrq %r9 -; FALLBACK2-NEXT: shrxq %r13, %r9, %r9 -; FALLBACK2-NEXT: orq %r15, %r9 -; FALLBACK2-NEXT: shrq %rdi -; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi -; FALLBACK2-NEXT: orq %rbp, %rdi -; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: shrq %r8 -; FALLBACK2-NEXT: shrxq %r13, %r8, %r8 -; FALLBACK2-NEXT: orq %rax, %r8 -; FALLBACK2-NEXT: movq %r12, (%rdx) -; FALLBACK2-NEXT: movq %r8, 48(%rdx) -; FALLBACK2-NEXT: movq %rsi, 56(%rdx) -; FALLBACK2-NEXT: movq %rdi, 32(%rdx) -; FALLBACK2-NEXT: movq %r9, 40(%rdx) -; FALLBACK2-NEXT: movq %rcx, 16(%rdx) -; FALLBACK2-NEXT: movq %r14, 24(%rdx) -; FALLBACK2-NEXT: movq %r10, 8(%rdx) -; FALLBACK2-NEXT: addq $8, %rsp -; FALLBACK2-NEXT: popq %rbx -; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 -; FALLBACK2-NEXT: popq %r14 -; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: shl_64bytes: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: pushq %r14 -; FALLBACK3-NEXT: pushq %rbx -; FALLBACK3-NEXT: pushq %rax -; FALLBACK3-NEXT: movq (%rdi), %rax -; FALLBACK3-NEXT: movq 8(%rdi), %rcx -; FALLBACK3-NEXT: movq 16(%rdi), %r8 -; FALLBACK3-NEXT: movq 24(%rdi), %r9 -; FALLBACK3-NEXT: movq 32(%rdi), %r10 -; FALLBACK3-NEXT: movq 40(%rdi), %r11 -; FALLBACK3-NEXT: movq 48(%rdi), %rbx -; FALLBACK3-NEXT: movq 56(%rdi), %rdi -; FALLBACK3-NEXT: movl (%rsi), %esi -; FALLBACK3-NEXT: xorps %xmm0, %xmm0 -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: leal (,%rsi,8), %ecx -; FALLBACK3-NEXT: andl $56, %ecx -; FALLBACK3-NEXT: andl $56, %esi -; FALLBACK3-NEXT: negl %esi -; FALLBACK3-NEXT: movslq %esi, %r8 -; FALLBACK3-NEXT: movq -48(%rsp,%r8), %rax -; FALLBACK3-NEXT: movq -40(%rsp,%r8), %r9 -; FALLBACK3-NEXT: movq %r9, %rsi -; FALLBACK3-NEXT: shldq %cl, %rax, %rsi -; FALLBACK3-NEXT: movq -64(%rsp,%r8), %r10 -; FALLBACK3-NEXT: movq -56(%rsp,%r8), %rdi -; FALLBACK3-NEXT: shldq %cl, %rdi, %rax -; FALLBACK3-NEXT: movq -32(%rsp,%r8), %r11 -; FALLBACK3-NEXT: movq -24(%rsp,%r8), %rbx -; FALLBACK3-NEXT: movq %rbx, %r14 -; FALLBACK3-NEXT: shldq %cl, %r11, %r14 -; FALLBACK3-NEXT: shldq %cl, %r9, %r11 -; FALLBACK3-NEXT: movq -16(%rsp,%r8), %r9 -; FALLBACK3-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK3-NEXT: shldq %cl, %r9, %r8 -; FALLBACK3-NEXT: shldq %cl, %rbx, %r9 -; FALLBACK3-NEXT: shldq %cl, %r10, %rdi -; FALLBACK3-NEXT: shlxq %rcx, %r10, %rcx -; FALLBACK3-NEXT: movq %r9, 48(%rdx) -; FALLBACK3-NEXT: movq %r8, 56(%rdx) -; FALLBACK3-NEXT: movq %r11, 32(%rdx) -; FALLBACK3-NEXT: movq 
%r14, 40(%rdx) -; FALLBACK3-NEXT: movq %rax, 16(%rdx) -; FALLBACK3-NEXT: movq %rsi, 24(%rdx) -; FALLBACK3-NEXT: movq %rcx, (%rdx) -; FALLBACK3-NEXT: movq %rdi, 8(%rdx) -; FALLBACK3-NEXT: addq $8, %rsp -; FALLBACK3-NEXT: popq %rbx -; FALLBACK3-NEXT: popq %r14 -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: shl_64bytes: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: pushq %r15 -; FALLBACK4-NEXT: pushq %r14 -; FALLBACK4-NEXT: pushq %r13 -; FALLBACK4-NEXT: pushq %r12 -; FALLBACK4-NEXT: pushq %rbx -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK4-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK4-NEXT: movl (%rsi), %ecx -; FALLBACK4-NEXT: xorps %xmm4, %xmm4 -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: leal (,%rcx,8), %eax -; FALLBACK4-NEXT: andl $56, %eax -; FALLBACK4-NEXT: andl $56, %ecx -; FALLBACK4-NEXT: negl %ecx -; FALLBACK4-NEXT: movslq %ecx, %r9 -; FALLBACK4-NEXT: movq -24(%rsp,%r9), %rdi -; FALLBACK4-NEXT: movq %rdi, %r10 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: movq -32(%rsp,%r9), %r11 -; FALLBACK4-NEXT: movq %r11, %r8 -; FALLBACK4-NEXT: shrq %r8 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r8 -; FALLBACK4-NEXT: orq %r10, %r8 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r11 -; FALLBACK4-NEXT: movq -40(%rsp,%r9), %rbx -; FALLBACK4-NEXT: movq %rbx, %r10 -; FALLBACK4-NEXT: shrq %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: orq %r11, %r10 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r15 -; FALLBACK4-NEXT: movq %r15, %r11 -; FALLBACK4-NEXT: shrq %r11 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r11 -; FALLBACK4-NEXT: orq %rbx, %r11 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r15 -; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r14 -; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r12 -; FALLBACK4-NEXT: movq %r12, %rbx -; FALLBACK4-NEXT: shrq %rbx -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %rbx -; FALLBACK4-NEXT: orq %r15, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r12 -; FALLBACK4-NEXT: movq %r14, %r15 -; FALLBACK4-NEXT: shrq %r15 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r15 -; FALLBACK4-NEXT: orq %r12, %r15 -; FALLBACK4-NEXT: movq -16(%rsp,%r9), %r12 -; FALLBACK4-NEXT: movq %r12, %r13 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r13 -; FALLBACK4-NEXT: shrq %rdi -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %rdi -; FALLBACK4-NEXT: orq %r13, %rdi -; FALLBACK4-NEXT: movq -8(%rsp,%r9), %r9 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r9 -; FALLBACK4-NEXT: shrq %r12 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shrq %cl, %r12 -; FALLBACK4-NEXT: orq %r9, %r12 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shlq %cl, %r14 -; FALLBACK4-NEXT: movq %r14, (%rdx) -; FALLBACK4-NEXT: movq %r12, 56(%rdx) -; FALLBACK4-NEXT: movq %rdi, 
48(%rdx) -; FALLBACK4-NEXT: movq %r15, 8(%rdx) -; FALLBACK4-NEXT: movq %rbx, 16(%rdx) -; FALLBACK4-NEXT: movq %r11, 24(%rdx) -; FALLBACK4-NEXT: movq %r10, 32(%rdx) -; FALLBACK4-NEXT: movq %r8, 40(%rdx) -; FALLBACK4-NEXT: popq %rbx -; FALLBACK4-NEXT: popq %r12 -; FALLBACK4-NEXT: popq %r13 -; FALLBACK4-NEXT: popq %r14 -; FALLBACK4-NEXT: popq %r15 -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: shl_64bytes: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: pushq %r15 -; FALLBACK5-NEXT: pushq %r14 -; FALLBACK5-NEXT: pushq %rbx -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK5-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK5-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK5-NEXT: movl (%rsi), %eax -; FALLBACK5-NEXT: xorps %xmm4, %xmm4 -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: leal (,%rax,8), %ecx -; FALLBACK5-NEXT: andl $56, %ecx -; FALLBACK5-NEXT: andl $56, %eax -; FALLBACK5-NEXT: negl %eax -; FALLBACK5-NEXT: movslq %eax, %r8 -; FALLBACK5-NEXT: movq -32(%rsp,%r8), %rax -; FALLBACK5-NEXT: movq -24(%rsp,%r8), %r9 -; FALLBACK5-NEXT: movq %r9, %rsi -; FALLBACK5-NEXT: shldq %cl, %rax, %rsi -; FALLBACK5-NEXT: movq -40(%rsp,%r8), %rdi -; FALLBACK5-NEXT: shldq %cl, %rdi, %rax -; FALLBACK5-NEXT: movq -48(%rsp,%r8), %r10 -; FALLBACK5-NEXT: shldq %cl, %r10, %rdi -; FALLBACK5-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK5-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK5-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK5-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK5-NEXT: movq %r14, %r15 -; FALLBACK5-NEXT: shldq %cl, %r9, %r15 -; FALLBACK5-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK5-NEXT: shldq %cl, %r14, %r8 -; FALLBACK5-NEXT: movq %r11, %r9 -; FALLBACK5-NEXT: shlq %cl, %r9 -; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK5-NEXT: shldq %cl, %r11, %rbx -; FALLBACK5-NEXT: movq %r8, 56(%rdx) -; FALLBACK5-NEXT: movq %r15, 48(%rdx) -; FALLBACK5-NEXT: movq %rbx, 8(%rdx) -; FALLBACK5-NEXT: movq %r10, 16(%rdx) -; FALLBACK5-NEXT: movq %rdi, 24(%rdx) -; FALLBACK5-NEXT: movq %rax, 32(%rdx) -; FALLBACK5-NEXT: movq %rsi, 40(%rdx) -; FALLBACK5-NEXT: movq %r9, (%rdx) -; FALLBACK5-NEXT: popq %rbx -; FALLBACK5-NEXT: popq %r14 -; FALLBACK5-NEXT: popq %r15 -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: shl_64bytes: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp -; FALLBACK6-NEXT: pushq %r15 -; FALLBACK6-NEXT: pushq %r14 -; FALLBACK6-NEXT: pushq %r13 -; FALLBACK6-NEXT: pushq %r12 -; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: subq $24, %rsp -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK6-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK6-NEXT: movl (%rsi), %eax -; FALLBACK6-NEXT: xorps %xmm4, %xmm4 -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm3, (%rsp) -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %ecx -; 
FALLBACK6-NEXT: andl $56, %ecx -; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: negl %eax -; FALLBACK6-NEXT: movslq %eax, %rsi -; FALLBACK6-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK6-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK6-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK6-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK6-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK6-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK6-NEXT: movl %ecx, %r9d -; FALLBACK6-NEXT: notb %r9b -; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %r9, %rdi, %rdi -; FALLBACK6-NEXT: orq %r12, %rdi -; FALLBACK6-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK6-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK6-NEXT: shrq %r13 -; FALLBACK6-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK6-NEXT: orq %r15, %r12 -; FALLBACK6-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK6-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK6-NEXT: shrq %r11 -; FALLBACK6-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK6-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK6-NEXT: shrq %r14 -; FALLBACK6-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK6-NEXT: orq %r10, %r14 -; FALLBACK6-NEXT: shrq %rsi -; FALLBACK6-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK6-NEXT: orq %rbx, %rsi -; FALLBACK6-NEXT: shrq %rax -; FALLBACK6-NEXT: shrxq %r9, %rax, %rax -; FALLBACK6-NEXT: orq %r8, %rax -; FALLBACK6-NEXT: shrq %rbp -; FALLBACK6-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK6-NEXT: orq %r15, %r8 -; FALLBACK6-NEXT: movq %rcx, (%rdx) -; FALLBACK6-NEXT: movq %r8, 56(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %rsi, 8(%rdx) -; FALLBACK6-NEXT: movq %r14, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r12, 32(%rdx) -; FALLBACK6-NEXT: movq %rdi, 40(%rdx) -; FALLBACK6-NEXT: addq $24, %rsp -; FALLBACK6-NEXT: popq %rbx -; FALLBACK6-NEXT: popq %r12 -; FALLBACK6-NEXT: popq %r13 -; FALLBACK6-NEXT: popq %r14 -; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp -; FALLBACK6-NEXT: retq -; -; FALLBACK7-LABEL: shl_64bytes: -; FALLBACK7: # %bb.0: -; FALLBACK7-NEXT: pushq %r15 -; FALLBACK7-NEXT: pushq %r14 -; FALLBACK7-NEXT: pushq %rbx -; FALLBACK7-NEXT: movups (%rdi), %xmm0 -; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK7-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK7-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK7-NEXT: movl (%rsi), %eax -; FALLBACK7-NEXT: xorps %xmm4, %xmm4 -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK7-NEXT: leal (,%rax,8), %ecx -; FALLBACK7-NEXT: andl $56, %ecx -; FALLBACK7-NEXT: andl $56, %eax -; FALLBACK7-NEXT: negl %eax -; FALLBACK7-NEXT: movslq %eax, %r8 -; FALLBACK7-NEXT: movq -32(%rsp,%r8), %rax -; FALLBACK7-NEXT: movq -24(%rsp,%r8), %r9 -; FALLBACK7-NEXT: movq %r9, %rsi -; FALLBACK7-NEXT: shldq %cl, %rax, %rsi -; FALLBACK7-NEXT: movq -40(%rsp,%r8), %rdi -; FALLBACK7-NEXT: shldq %cl, %rdi, %rax -; FALLBACK7-NEXT: movq -48(%rsp,%r8), %r10 -; FALLBACK7-NEXT: shldq %cl, %r10, %rdi -; FALLBACK7-NEXT: movq 
-64(%rsp,%r8), %r11 -; FALLBACK7-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK7-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK7-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK7-NEXT: movq %r14, %r15 -; FALLBACK7-NEXT: shldq %cl, %r9, %r15 -; FALLBACK7-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK7-NEXT: shldq %cl, %r14, %r8 -; FALLBACK7-NEXT: shlxq %rcx, %r11, %r9 -; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK7-NEXT: shldq %cl, %r11, %rbx -; FALLBACK7-NEXT: movq %r8, 56(%rdx) -; FALLBACK7-NEXT: movq %r15, 48(%rdx) -; FALLBACK7-NEXT: movq %rbx, 8(%rdx) -; FALLBACK7-NEXT: movq %r10, 16(%rdx) -; FALLBACK7-NEXT: movq %rdi, 24(%rdx) -; FALLBACK7-NEXT: movq %rax, 32(%rdx) -; FALLBACK7-NEXT: movq %rsi, 40(%rdx) -; FALLBACK7-NEXT: movq %r9, (%rdx) -; FALLBACK7-NEXT: popq %rbx -; FALLBACK7-NEXT: popq %r14 -; FALLBACK7-NEXT: popq %r15 -; FALLBACK7-NEXT: retq -; -; FALLBACK8-LABEL: shl_64bytes: -; FALLBACK8: # %bb.0: -; FALLBACK8-NEXT: pushq %r15 -; FALLBACK8-NEXT: pushq %r14 -; FALLBACK8-NEXT: pushq %r13 -; FALLBACK8-NEXT: pushq %r12 -; FALLBACK8-NEXT: pushq %rbx -; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK8-NEXT: movl (%rsi), %ecx -; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK8-NEXT: leal (,%rcx,8), %eax -; FALLBACK8-NEXT: andl $56, %eax -; FALLBACK8-NEXT: andl $56, %ecx -; FALLBACK8-NEXT: negl %ecx -; FALLBACK8-NEXT: movslq %ecx, %r9 -; FALLBACK8-NEXT: movq -24(%rsp,%r9), %rdi -; FALLBACK8-NEXT: movq %rdi, %r10 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r10 -; FALLBACK8-NEXT: movl %eax, %esi -; FALLBACK8-NEXT: notb %sil -; FALLBACK8-NEXT: movq -32(%rsp,%r9), %r11 -; FALLBACK8-NEXT: movq %r11, %r8 -; FALLBACK8-NEXT: shrq %r8 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r8 -; FALLBACK8-NEXT: orq %r10, %r8 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r11 -; FALLBACK8-NEXT: movq -40(%rsp,%r9), %rbx -; FALLBACK8-NEXT: movq %rbx, %r10 -; FALLBACK8-NEXT: shrq %r10 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r10 -; FALLBACK8-NEXT: orq %r11, %r10 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %rbx -; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r15 -; FALLBACK8-NEXT: movq %r15, %r11 -; FALLBACK8-NEXT: shrq %r11 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r11 -; FALLBACK8-NEXT: orq %rbx, %r11 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r15 -; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r14 -; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r12 -; FALLBACK8-NEXT: movq %r12, %rbx -; FALLBACK8-NEXT: shrq %rbx -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %rbx -; FALLBACK8-NEXT: orq %r15, %rbx -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r12 -; FALLBACK8-NEXT: movq %r14, %r15 -; FALLBACK8-NEXT: shrq %r15 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r15 -; FALLBACK8-NEXT: orq %r12, %r15 -; FALLBACK8-NEXT: movq -16(%rsp,%r9), %r12 -; FALLBACK8-NEXT: movq %r12, %r13 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r13 -; FALLBACK8-NEXT: shrq %rdi -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %rdi -; FALLBACK8-NEXT: orq %r13, %rdi -; FALLBACK8-NEXT: movq -8(%rsp,%r9), %r9 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r9 -; 
FALLBACK8-NEXT: shrq %r12 -; FALLBACK8-NEXT: movl %esi, %ecx -; FALLBACK8-NEXT: shrq %cl, %r12 -; FALLBACK8-NEXT: orq %r9, %r12 -; FALLBACK8-NEXT: movl %eax, %ecx -; FALLBACK8-NEXT: shlq %cl, %r14 -; FALLBACK8-NEXT: movq %r14, (%rdx) -; FALLBACK8-NEXT: movq %r12, 56(%rdx) -; FALLBACK8-NEXT: movq %rdi, 48(%rdx) -; FALLBACK8-NEXT: movq %r15, 8(%rdx) -; FALLBACK8-NEXT: movq %rbx, 16(%rdx) -; FALLBACK8-NEXT: movq %r11, 24(%rdx) -; FALLBACK8-NEXT: movq %r10, 32(%rdx) -; FALLBACK8-NEXT: movq %r8, 40(%rdx) -; FALLBACK8-NEXT: popq %rbx -; FALLBACK8-NEXT: popq %r12 -; FALLBACK8-NEXT: popq %r13 -; FALLBACK8-NEXT: popq %r14 -; FALLBACK8-NEXT: popq %r15 -; FALLBACK8-NEXT: vzeroupper -; FALLBACK8-NEXT: retq -; -; FALLBACK9-LABEL: shl_64bytes: -; FALLBACK9: # %bb.0: -; FALLBACK9-NEXT: pushq %r15 -; FALLBACK9-NEXT: pushq %r14 -; FALLBACK9-NEXT: pushq %rbx -; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK9-NEXT: movl (%rsi), %eax -; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: leal (,%rax,8), %ecx -; FALLBACK9-NEXT: andl $56, %ecx -; FALLBACK9-NEXT: andl $56, %eax -; FALLBACK9-NEXT: negl %eax -; FALLBACK9-NEXT: movslq %eax, %r8 -; FALLBACK9-NEXT: movq -32(%rsp,%r8), %rax -; FALLBACK9-NEXT: movq -24(%rsp,%r8), %r9 -; FALLBACK9-NEXT: movq %r9, %rsi -; FALLBACK9-NEXT: shldq %cl, %rax, %rsi -; FALLBACK9-NEXT: movq -40(%rsp,%r8), %rdi -; FALLBACK9-NEXT: shldq %cl, %rdi, %rax -; FALLBACK9-NEXT: movq -48(%rsp,%r8), %r10 -; FALLBACK9-NEXT: shldq %cl, %r10, %rdi -; FALLBACK9-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK9-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK9-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK9-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK9-NEXT: movq %r14, %r15 -; FALLBACK9-NEXT: shldq %cl, %r9, %r15 -; FALLBACK9-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK9-NEXT: shldq %cl, %r14, %r8 -; FALLBACK9-NEXT: movq %r11, %r9 -; FALLBACK9-NEXT: shlq %cl, %r9 -; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK9-NEXT: shldq %cl, %r11, %rbx -; FALLBACK9-NEXT: movq %r8, 56(%rdx) -; FALLBACK9-NEXT: movq %r15, 48(%rdx) -; FALLBACK9-NEXT: movq %rbx, 8(%rdx) -; FALLBACK9-NEXT: movq %r10, 16(%rdx) -; FALLBACK9-NEXT: movq %rdi, 24(%rdx) -; FALLBACK9-NEXT: movq %rax, 32(%rdx) -; FALLBACK9-NEXT: movq %rsi, 40(%rdx) -; FALLBACK9-NEXT: movq %r9, (%rdx) -; FALLBACK9-NEXT: popq %rbx -; FALLBACK9-NEXT: popq %r14 -; FALLBACK9-NEXT: popq %r15 -; FALLBACK9-NEXT: vzeroupper -; FALLBACK9-NEXT: retq -; -; FALLBACK10-LABEL: shl_64bytes: -; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp -; FALLBACK10-NEXT: pushq %r15 -; FALLBACK10-NEXT: pushq %r14 -; FALLBACK10-NEXT: pushq %r13 -; FALLBACK10-NEXT: pushq %r12 -; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: subq $24, %rsp -; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %eax -; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %ecx -; FALLBACK10-NEXT: andl $56, %ecx -; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: negl %eax -; FALLBACK10-NEXT: movslq %eax, %rsi -; FALLBACK10-NEXT: movq 
-8(%rsp,%rsi), %rax -; FALLBACK10-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK10-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK10-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK10-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK10-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK10-NEXT: movl %ecx, %r9d -; FALLBACK10-NEXT: notb %r9b -; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %r9, %rdi, %rdi -; FALLBACK10-NEXT: orq %r12, %rdi -; FALLBACK10-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK10-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK10-NEXT: shrq %r13 -; FALLBACK10-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK10-NEXT: orq %r15, %r12 -; FALLBACK10-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK10-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK10-NEXT: shrq %r11 -; FALLBACK10-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK10-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK10-NEXT: shrq %r14 -; FALLBACK10-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK10-NEXT: orq %r10, %r14 -; FALLBACK10-NEXT: shrq %rsi -; FALLBACK10-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK10-NEXT: orq %rbx, %rsi -; FALLBACK10-NEXT: shrq %rax -; FALLBACK10-NEXT: shrxq %r9, %rax, %rax -; FALLBACK10-NEXT: orq %r8, %rax -; FALLBACK10-NEXT: shrq %rbp -; FALLBACK10-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK10-NEXT: orq %r15, %r8 -; FALLBACK10-NEXT: movq %rcx, (%rdx) -; FALLBACK10-NEXT: movq %r8, 56(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %rsi, 8(%rdx) -; FALLBACK10-NEXT: movq %r14, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r12, 32(%rdx) -; FALLBACK10-NEXT: movq %rdi, 40(%rdx) -; FALLBACK10-NEXT: addq $24, %rsp -; FALLBACK10-NEXT: popq %rbx -; FALLBACK10-NEXT: popq %r12 -; FALLBACK10-NEXT: popq %r13 -; FALLBACK10-NEXT: popq %r14 -; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp -; FALLBACK10-NEXT: vzeroupper -; FALLBACK10-NEXT: retq -; -; FALLBACK11-LABEL: shl_64bytes: -; FALLBACK11: # %bb.0: -; FALLBACK11-NEXT: pushq %r15 -; FALLBACK11-NEXT: pushq %r14 -; FALLBACK11-NEXT: pushq %rbx -; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK11-NEXT: movl (%rsi), %eax -; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK11-NEXT: leal (,%rax,8), %ecx -; FALLBACK11-NEXT: andl $56, %ecx -; FALLBACK11-NEXT: andl $56, %eax -; FALLBACK11-NEXT: negl %eax -; FALLBACK11-NEXT: movslq %eax, %r8 -; FALLBACK11-NEXT: movq -32(%rsp,%r8), %rax -; FALLBACK11-NEXT: movq -24(%rsp,%r8), %r9 -; FALLBACK11-NEXT: movq %r9, %rsi -; FALLBACK11-NEXT: shldq %cl, %rax, %rsi -; FALLBACK11-NEXT: movq -40(%rsp,%r8), %rdi -; FALLBACK11-NEXT: shldq %cl, %rdi, %rax -; FALLBACK11-NEXT: movq -48(%rsp,%r8), %r10 -; FALLBACK11-NEXT: shldq %cl, %r10, %rdi -; FALLBACK11-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK11-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK11-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK11-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK11-NEXT: movq %r14, %r15 -; FALLBACK11-NEXT: shldq %cl, %r9, %r15 -; FALLBACK11-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK11-NEXT: shldq %cl, %r14, %r8 -; 
FALLBACK11-NEXT: shlxq %rcx, %r11, %r9 -; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shldq %cl, %r11, %rbx -; FALLBACK11-NEXT: movq %r8, 56(%rdx) -; FALLBACK11-NEXT: movq %r15, 48(%rdx) -; FALLBACK11-NEXT: movq %rbx, 8(%rdx) -; FALLBACK11-NEXT: movq %r10, 16(%rdx) -; FALLBACK11-NEXT: movq %rdi, 24(%rdx) -; FALLBACK11-NEXT: movq %rax, 32(%rdx) -; FALLBACK11-NEXT: movq %rsi, 40(%rdx) -; FALLBACK11-NEXT: movq %r9, (%rdx) -; FALLBACK11-NEXT: popq %rbx -; FALLBACK11-NEXT: popq %r14 -; FALLBACK11-NEXT: popq %r15 -; FALLBACK11-NEXT: vzeroupper -; FALLBACK11-NEXT: retq -; -; FALLBACK12-LABEL: shl_64bytes: -; FALLBACK12: # %bb.0: -; FALLBACK12-NEXT: pushq %r15 -; FALLBACK12-NEXT: pushq %r14 -; FALLBACK12-NEXT: pushq %r13 -; FALLBACK12-NEXT: pushq %r12 -; FALLBACK12-NEXT: pushq %rbx -; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK12-NEXT: movl (%rsi), %ecx -; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: leal (,%rcx,8), %eax -; FALLBACK12-NEXT: andl $56, %eax -; FALLBACK12-NEXT: andl $56, %ecx -; FALLBACK12-NEXT: negl %ecx -; FALLBACK12-NEXT: movslq %ecx, %r9 -; FALLBACK12-NEXT: movq -24(%rsp,%r9), %rdi -; FALLBACK12-NEXT: movq %rdi, %r10 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: movl %eax, %esi -; FALLBACK12-NEXT: notb %sil -; FALLBACK12-NEXT: movq -32(%rsp,%r9), %r11 -; FALLBACK12-NEXT: movq %r11, %r8 -; FALLBACK12-NEXT: shrq %r8 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r8 -; FALLBACK12-NEXT: orq %r10, %r8 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r11 -; FALLBACK12-NEXT: movq -40(%rsp,%r9), %rbx -; FALLBACK12-NEXT: movq %rbx, %r10 -; FALLBACK12-NEXT: shrq %r10 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 -; FALLBACK12-NEXT: orq %r11, %r10 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %rbx -; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r15 -; FALLBACK12-NEXT: movq %r15, %r11 -; FALLBACK12-NEXT: shrq %r11 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r11 -; FALLBACK12-NEXT: orq %rbx, %r11 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r15 -; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r14 -; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r12 -; FALLBACK12-NEXT: movq %r12, %rbx -; FALLBACK12-NEXT: shrq %rbx -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: orq %r15, %rbx -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r12 -; FALLBACK12-NEXT: movq %r14, %r15 -; FALLBACK12-NEXT: shrq %r15 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r15 -; FALLBACK12-NEXT: orq %r12, %r15 -; FALLBACK12-NEXT: movq -16(%rsp,%r9), %r12 -; FALLBACK12-NEXT: movq %r12, %r13 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r13 -; FALLBACK12-NEXT: shrq %rdi -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %rdi -; FALLBACK12-NEXT: orq %r13, %rdi -; FALLBACK12-NEXT: movq -8(%rsp,%r9), %r9 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r9 -; FALLBACK12-NEXT: shrq %r12 -; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shrq %cl, %r12 -; FALLBACK12-NEXT: orq %r9, %r12 -; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shlq %cl, %r14 -; FALLBACK12-NEXT: movq %r14, (%rdx) -; FALLBACK12-NEXT: movq %r12, 56(%rdx) -; FALLBACK12-NEXT: movq %rdi, 48(%rdx) -; 
FALLBACK12-NEXT: movq %r15, 8(%rdx) -; FALLBACK12-NEXT: movq %rbx, 16(%rdx) -; FALLBACK12-NEXT: movq %r11, 24(%rdx) -; FALLBACK12-NEXT: movq %r10, 32(%rdx) -; FALLBACK12-NEXT: movq %r8, 40(%rdx) -; FALLBACK12-NEXT: popq %rbx -; FALLBACK12-NEXT: popq %r12 -; FALLBACK12-NEXT: popq %r13 -; FALLBACK12-NEXT: popq %r14 -; FALLBACK12-NEXT: popq %r15 -; FALLBACK12-NEXT: vzeroupper -; FALLBACK12-NEXT: retq -; -; FALLBACK13-LABEL: shl_64bytes: -; FALLBACK13: # %bb.0: -; FALLBACK13-NEXT: pushq %r15 -; FALLBACK13-NEXT: pushq %r14 -; FALLBACK13-NEXT: pushq %rbx -; FALLBACK13-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK13-NEXT: movl (%rsi), %eax -; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK13-NEXT: leal (,%rax,8), %ecx -; FALLBACK13-NEXT: andl $56, %ecx -; FALLBACK13-NEXT: andl $56, %eax -; FALLBACK13-NEXT: negl %eax -; FALLBACK13-NEXT: movslq %eax, %r8 -; FALLBACK13-NEXT: movq -32(%rsp,%r8), %rax -; FALLBACK13-NEXT: movq -24(%rsp,%r8), %r9 -; FALLBACK13-NEXT: movq %r9, %rsi -; FALLBACK13-NEXT: shldq %cl, %rax, %rsi -; FALLBACK13-NEXT: movq -40(%rsp,%r8), %rdi -; FALLBACK13-NEXT: shldq %cl, %rdi, %rax -; FALLBACK13-NEXT: movq -48(%rsp,%r8), %r10 -; FALLBACK13-NEXT: shldq %cl, %r10, %rdi -; FALLBACK13-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK13-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK13-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK13-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK13-NEXT: movq %r14, %r15 -; FALLBACK13-NEXT: shldq %cl, %r9, %r15 -; FALLBACK13-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK13-NEXT: shldq %cl, %r14, %r8 -; FALLBACK13-NEXT: movq %r11, %r9 -; FALLBACK13-NEXT: shlq %cl, %r9 -; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK13-NEXT: shldq %cl, %r11, %rbx -; FALLBACK13-NEXT: movq %r8, 56(%rdx) -; FALLBACK13-NEXT: movq %r15, 48(%rdx) -; FALLBACK13-NEXT: movq %rbx, 8(%rdx) -; FALLBACK13-NEXT: movq %r10, 16(%rdx) -; FALLBACK13-NEXT: movq %rdi, 24(%rdx) -; FALLBACK13-NEXT: movq %rax, 32(%rdx) -; FALLBACK13-NEXT: movq %rsi, 40(%rdx) -; FALLBACK13-NEXT: movq %r9, (%rdx) -; FALLBACK13-NEXT: popq %rbx -; FALLBACK13-NEXT: popq %r14 -; FALLBACK13-NEXT: popq %r15 -; FALLBACK13-NEXT: vzeroupper -; FALLBACK13-NEXT: retq -; -; FALLBACK14-LABEL: shl_64bytes: -; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %rbp -; FALLBACK14-NEXT: pushq %r15 -; FALLBACK14-NEXT: pushq %r14 -; FALLBACK14-NEXT: pushq %r13 -; FALLBACK14-NEXT: pushq %r12 -; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: subq $24, %rsp -; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK14-NEXT: movl (%rsi), %eax -; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rax,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx -; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: negl %eax -; FALLBACK14-NEXT: movslq %eax, %rsi -; FALLBACK14-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK14-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK14-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK14-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK14-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK14-NEXT: movl %ecx, %r9d -; FALLBACK14-NEXT: notb %r9b -; 
FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %r9, %rdi, %rdi -; FALLBACK14-NEXT: orq %r12, %rdi -; FALLBACK14-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK14-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK14-NEXT: shrq %r13 -; FALLBACK14-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK14-NEXT: orq %r15, %r12 -; FALLBACK14-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK14-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: shrq %r11 -; FALLBACK14-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK14-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK14-NEXT: shrq %r14 -; FALLBACK14-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK14-NEXT: orq %r10, %r14 -; FALLBACK14-NEXT: shrq %rsi -; FALLBACK14-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK14-NEXT: orq %rbx, %rsi -; FALLBACK14-NEXT: shrq %rax -; FALLBACK14-NEXT: shrxq %r9, %rax, %rax -; FALLBACK14-NEXT: orq %r8, %rax -; FALLBACK14-NEXT: shrq %rbp -; FALLBACK14-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK14-NEXT: orq %r15, %r8 -; FALLBACK14-NEXT: movq %rcx, (%rdx) -; FALLBACK14-NEXT: movq %r8, 56(%rdx) -; FALLBACK14-NEXT: movq %rax, 48(%rdx) -; FALLBACK14-NEXT: movq %rsi, 8(%rdx) -; FALLBACK14-NEXT: movq %r14, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r12, 32(%rdx) -; FALLBACK14-NEXT: movq %rdi, 40(%rdx) -; FALLBACK14-NEXT: addq $24, %rsp -; FALLBACK14-NEXT: popq %rbx -; FALLBACK14-NEXT: popq %r12 -; FALLBACK14-NEXT: popq %r13 -; FALLBACK14-NEXT: popq %r14 -; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp -; FALLBACK14-NEXT: vzeroupper -; FALLBACK14-NEXT: retq -; -; FALLBACK15-LABEL: shl_64bytes: -; FALLBACK15: # %bb.0: -; FALLBACK15-NEXT: pushq %r15 -; FALLBACK15-NEXT: pushq %r14 -; FALLBACK15-NEXT: pushq %rbx -; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK15-NEXT: movl (%rsi), %eax -; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: leal (,%rax,8), %ecx -; FALLBACK15-NEXT: andl $56, %ecx -; FALLBACK15-NEXT: andl $56, %eax -; FALLBACK15-NEXT: negl %eax -; FALLBACK15-NEXT: movslq %eax, %r8 -; FALLBACK15-NEXT: movq -32(%rsp,%r8), %rax -; FALLBACK15-NEXT: movq -24(%rsp,%r8), %r9 -; FALLBACK15-NEXT: movq %r9, %rsi -; FALLBACK15-NEXT: shldq %cl, %rax, %rsi -; FALLBACK15-NEXT: movq -40(%rsp,%r8), %rdi -; FALLBACK15-NEXT: shldq %cl, %rdi, %rax -; FALLBACK15-NEXT: movq -48(%rsp,%r8), %r10 -; FALLBACK15-NEXT: shldq %cl, %r10, %rdi -; FALLBACK15-NEXT: movq -64(%rsp,%r8), %r11 -; FALLBACK15-NEXT: movq -56(%rsp,%r8), %rbx -; FALLBACK15-NEXT: shldq %cl, %rbx, %r10 -; FALLBACK15-NEXT: movq -16(%rsp,%r8), %r14 -; FALLBACK15-NEXT: movq %r14, %r15 -; FALLBACK15-NEXT: shldq %cl, %r9, %r15 -; FALLBACK15-NEXT: movq -8(%rsp,%r8), %r8 -; FALLBACK15-NEXT: shldq %cl, %r14, %r8 -; FALLBACK15-NEXT: shlxq %rcx, %r11, %r9 -; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shldq %cl, %r11, %rbx -; FALLBACK15-NEXT: movq %r8, 56(%rdx) -; FALLBACK15-NEXT: movq %r15, 48(%rdx) -; FALLBACK15-NEXT: movq %rbx, 8(%rdx) -; FALLBACK15-NEXT: movq %r10, 16(%rdx) -; FALLBACK15-NEXT: movq %rdi, 24(%rdx) -; FALLBACK15-NEXT: movq %rax, 32(%rdx) -; FALLBACK15-NEXT: movq %rsi, 40(%rdx) -; FALLBACK15-NEXT: movq %r9, (%rdx) -; FALLBACK15-NEXT: popq %rbx -; FALLBACK15-NEXT: popq %r14 -; FALLBACK15-NEXT: popq %r15 -; FALLBACK15-NEXT: vzeroupper -; FALLBACK15-NEXT: retq -; -; FALLBACK16-LABEL: shl_64bytes: -; FALLBACK16: # %bb.0: -; FALLBACK16-NEXT: pushl %ebp -; 
FALLBACK16-NEXT: pushl %ebx -; FALLBACK16-NEXT: pushl %edi -; FALLBACK16-NEXT: pushl %esi -; FALLBACK16-NEXT: subl $204, %esp -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl (%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 8(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 16(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 20(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 24(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 28(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 32(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 40(%eax), %ebp -; FALLBACK16-NEXT: movl 44(%eax), %ebx -; FALLBACK16-NEXT: movl 48(%eax), %edi -; FALLBACK16-NEXT: movl 52(%eax), %esi -; FALLBACK16-NEXT: movl 56(%eax), %edx -; FALLBACK16-NEXT: movl 60(%eax), %ecx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl (%eax), %eax -; FALLBACK16-NEXT: xorps %xmm0, %xmm0 -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %eax, %edx -; FALLBACK16-NEXT: andl $60, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: subl 
%edx, %ecx
-; FALLBACK16-NEXT: movl (%ecx), %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 4(%ecx), %edx
-; FALLBACK16-NEXT: movl %ecx, %ebp
-; FALLBACK16-NEXT: shll $3, %eax
-; FALLBACK16-NEXT: andl $24, %eax
-; FALLBACK16-NEXT: movl %edx, %esi
-; FALLBACK16-NEXT: movl %eax, %ecx
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: shrl %edi
-; FALLBACK16-NEXT: movb %al, %ch
-; FALLBACK16-NEXT: notb %ch
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: orl %esi, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 12(%ebp), %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: movl 8(%ebp), %esi
-; FALLBACK16-NEXT: movl %ebp, %edi
-; FALLBACK16-NEXT: movl %esi, %ebp
-; FALLBACK16-NEXT: shrl %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: orl %ebx, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: shrl %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: orl %esi, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %edi, %ebp
-; FALLBACK16-NEXT: movl 20(%edi), %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: movl 16(%edi), %esi
-; FALLBACK16-NEXT: movl %esi, %edx
-; FALLBACK16-NEXT: shrl %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: orl %ebx, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK16-NEXT: shrl %edi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: orl %esi, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %ebp, %edx
-; FALLBACK16-NEXT: movl 28(%ebp), %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: movl 24(%ebp), %esi
-; FALLBACK16-NEXT: movl %esi, %edi
-; FALLBACK16-NEXT: shrl %edi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: orl %ebx, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK16-NEXT: shrl %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: orl %esi, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 36(%edx), %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: movl 32(%edx), %esi
-; FALLBACK16-NEXT: movl %edx, %ebp
-; FALLBACK16-NEXT: movl %esi, %edi
-; FALLBACK16-NEXT: shrl %edi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: orl %ebx, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: shrl %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: orl %esi, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 44(%ebp), %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: movl 40(%ebp), %esi
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %esi, %edx
-; FALLBACK16-NEXT: shrl %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: orl %ebx, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: shrl %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: orl %esi, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 52(%ebp), %esi
-; FALLBACK16-NEXT: movl %esi, %edi
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: negl %edx
-; FALLBACK16-NEXT: movl 176(%esp,%edx), %ebx
-; FALLBACK16-NEXT: movl %ebx, %ebp
-; FALLBACK16-NEXT: shrl %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: orl %edi, %ebp
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: shrl %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: orl %ebx, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK16-NEXT: movl 60(%edi), %edx
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: movl 56(%edi), %ebx
-; FALLBACK16-NEXT: movl %ebx, %edi
-; FALLBACK16-NEXT: shrl %edi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: orl %edx, %edi
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: shrl %esi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: orl %ebx, %esi
-; FALLBACK16-NEXT: movl %eax, %ecx
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl %edx, (%eax)
-; FALLBACK16-NEXT: movl %esi, 56(%eax)
-; FALLBACK16-NEXT: movl %edi, 60(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 48(%eax)
-; FALLBACK16-NEXT: movl %ebp, 52(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 40(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 44(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 32(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 36(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 24(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 28(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 16(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 20(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 8(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 12(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 4(%eax)
-; FALLBACK16-NEXT: addl $204, %esp
-; FALLBACK16-NEXT: popl %esi
-; FALLBACK16-NEXT: popl %edi
-; FALLBACK16-NEXT: popl %ebx
-; FALLBACK16-NEXT: popl %ebp
-; FALLBACK16-NEXT: retl
-;
-; FALLBACK17-LABEL: shl_64bytes:
-; FALLBACK17: # %bb.0:
-; FALLBACK17-NEXT: pushl %ebp
-; FALLBACK17-NEXT: pushl %ebx
-; FALLBACK17-NEXT: pushl %edi
-; FALLBACK17-NEXT: pushl %esi
-; FALLBACK17-NEXT: subl $188, %esp
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl (%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 4(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 8(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 12(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 16(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 20(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 24(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 28(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 32(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 36(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: movl 40(%ecx), %ebp
-; FALLBACK17-NEXT: movl 44(%ecx), %ebx
-; FALLBACK17-NEXT: movl 48(%ecx), %edi
-; FALLBACK17-NEXT: movl 52(%ecx), %esi
-; FALLBACK17-NEXT: movl 56(%ecx), %edx
-; FALLBACK17-NEXT: movl 60(%ecx), %eax
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl (%ecx), %ecx
-; FALLBACK17-NEXT: xorps %xmm0, %xmm0
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ecx, %ebp
-; FALLBACK17-NEXT: andl $60, %ebp
-; FALLBACK17-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK17-NEXT: subl %ebp, %eax
-; FALLBACK17-NEXT: movl 8(%eax), %esi
-; FALLBACK17-NEXT: movl 12(%eax), %edx
-; FALLBACK17-NEXT: shll $3, %ecx
-; FALLBACK17-NEXT: andl $24, %ecx
-; FALLBACK17-NEXT: movl %edx, %edi
-; FALLBACK17-NEXT: shldl %cl, %esi, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 4(%eax), %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %edi, %esi
-; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 16(%eax), %edi
-; FALLBACK17-NEXT: movl 20(%eax), %esi
-; FALLBACK17-NEXT: movl %esi, %ebx
-; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %edx, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 24(%eax), %edi
-; FALLBACK17-NEXT: movl 28(%eax), %edx
-; FALLBACK17-NEXT: movl %edx, %ebx
-; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %esi, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 32(%eax), %edi
-; FALLBACK17-NEXT: movl 36(%eax), %esi
-; FALLBACK17-NEXT: movl %esi, %ebx
-; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %edx, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 40(%eax), %edx
-; FALLBACK17-NEXT: movl 44(%eax), %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %edx, %edi
-; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %esi, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 56(%eax), %edx
-; FALLBACK17-NEXT: movl 60(%eax), %edi
-; FALLBACK17-NEXT: shldl %cl, %edx, %edi
-; FALLBACK17-NEXT: movl (%eax), %ebx
-; FALLBACK17-NEXT: movl 52(%eax), %esi
-; FALLBACK17-NEXT: shldl %cl, %esi, %edx
-; FALLBACK17-NEXT: negl %ebp
-; FALLBACK17-NEXT: movl 160(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK17-NEXT: movl %edx, 56(%ebp)
-; FALLBACK17-NEXT: movl %edi, 60(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK17-NEXT: shll %cl, %ebx
-; FALLBACK17-NEXT: shldl %cl, %eax, %esi
-; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK17-NEXT: shldl %cl, %edi, %eax
-; FALLBACK17-NEXT: movl %eax, 48(%ebp)
-; FALLBACK17-NEXT: movl %esi, 52(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 40(%ebp)
-; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 44(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 32(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 36(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 24(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 28(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 16(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 20(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 8(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 12(%ebp)
-; FALLBACK17-NEXT: movl %ebx, (%ebp)
-; FALLBACK17-NEXT: movl %edx, 4(%ebp)
-; FALLBACK17-NEXT: addl $188, %esp
-; FALLBACK17-NEXT: popl %esi
-; FALLBACK17-NEXT: popl %edi
-; FALLBACK17-NEXT: popl %ebx
-; FALLBACK17-NEXT: popl %ebp
-; FALLBACK17-NEXT: retl
-;
-; FALLBACK18-LABEL: shl_64bytes:
-; FALLBACK18: # %bb.0:
-; FALLBACK18-NEXT: pushl %ebp
-; FALLBACK18-NEXT: pushl %ebx
-; FALLBACK18-NEXT: pushl %edi
-; FALLBACK18-NEXT: pushl %esi
-; FALLBACK18-NEXT: subl $204, %esp
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl (%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 4(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 8(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 12(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 16(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 20(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 24(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 28(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 32(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 36(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 40(%eax), %ebx
-; FALLBACK18-NEXT: movl 44(%eax), %edi
-; FALLBACK18-NEXT: movl 48(%eax), %esi
-; FALLBACK18-NEXT: movl 52(%eax), %edx
-; FALLBACK18-NEXT: movl 56(%eax), %ecx
-; FALLBACK18-NEXT: movl 60(%eax), %eax
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK18-NEXT: movl (%ebp), %ebp
-; FALLBACK18-NEXT: xorps %xmm0, %xmm0
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: leal (,%ebp,8), %edx
-; FALLBACK18-NEXT: andl $24, %edx
-; FALLBACK18-NEXT: andl $60, %ebp
-; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi
-; FALLBACK18-NEXT: subl %ebp, %edi
-; FALLBACK18-NEXT: movl (%edi), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 4(%edi), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl %edx, %ebx
-; FALLBACK18-NEXT: notb %bl
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %esi
-; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx
-; FALLBACK18-NEXT: orl %ecx, %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 8(%edi), %esi
-; FALLBACK18-NEXT: movl %esi, %ecx
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK18-NEXT: movl 12(%edi), %ecx
-; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, %esi, %esi
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: shrl %eax
-; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT: orl %esi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 16(%edi), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrl %eax
-; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT: movl 20(%edi), %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT: orl %eax, %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 24(%edi), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK18-NEXT: movl 28(%edi), %ecx
-; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrl %esi
-; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK18-NEXT: orl %eax, %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 32(%edi), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrl %eax
-; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT: movl 36(%edi), %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT: orl %eax, %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 40(%edi), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK18-NEXT: movl 44(%edi), %ecx
-; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrl %esi
-; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK18-NEXT: orl %eax, %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 48(%edi), %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrl %esi
-; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax
-; FALLBACK18-NEXT: movl 52(%edi), %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ebp
-; FALLBACK18-NEXT: orl %eax, %ebp
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: negl %eax
-; FALLBACK18-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
-; FALLBACK18-NEXT: movl 56(%edi), %eax
-; FALLBACK18-NEXT: shlxl %edx, %eax, %edx
-; FALLBACK18-NEXT: shrl %esi
-; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK18-NEXT: orl %edx, %esi
-; FALLBACK18-NEXT: shrl %eax
-; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT: orl %eax, %ecx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, (%eax)
-; FALLBACK18-NEXT: movl %esi, 56(%eax)
-; FALLBACK18-NEXT: movl %ecx, 60(%eax)
-; FALLBACK18-NEXT: movl %ebp, 48(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 52(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 40(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 44(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 32(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 36(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 24(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 28(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 16(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 20(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 8(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 12(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 4(%eax)
-; FALLBACK18-NEXT: addl $204, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: shl_64bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $204, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl (%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 4(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 8(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 12(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 16(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 20(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 24(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 28(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 32(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 36(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 40(%ebp), %ebx
-; FALLBACK19-NEXT: movl 44(%ebp), %edi
-; FALLBACK19-NEXT: movl 48(%ebp), %esi
-; FALLBACK19-NEXT: movl 52(%ebp), %edx
-; FALLBACK19-NEXT: movl 56(%ebp), %ecx
-; FALLBACK19-NEXT: movl 60(%ebp), %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl (%ebp), %ebp
-; FALLBACK19-NEXT: xorps %xmm0, %xmm0
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: leal (,%ebp,8), %ecx
-; FALLBACK19-NEXT: andl $24, %ecx
-; FALLBACK19-NEXT: andl $60, %ebp
-; FALLBACK19-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK19-NEXT: subl %ebp, %eax
-; FALLBACK19-NEXT: movl 4(%eax), %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 8(%eax), %edi
-; FALLBACK19-NEXT: movl 12(%eax), %edx
-; FALLBACK19-NEXT: movl %edx, %ebx
-; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %esi, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 16(%eax), %edi
-; FALLBACK19-NEXT: movl 20(%eax), %esi
-; FALLBACK19-NEXT: movl %esi, %ebx
-; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %edx, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 24(%eax), %edi
-; FALLBACK19-NEXT: movl 28(%eax), %edx
-; FALLBACK19-NEXT: movl %edx, %ebx
-; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %esi, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 32(%eax), %edi
-; FALLBACK19-NEXT: movl 36(%eax), %esi
-; FALLBACK19-NEXT: movl %esi, %ebx
-; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %edx, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 40(%eax), %ebx
-; FALLBACK19-NEXT: movl 44(%eax), %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %esi, %ebx
-; FALLBACK19-NEXT: movl 56(%eax), %edx
-; FALLBACK19-NEXT: movl 60(%eax), %edi
-; FALLBACK19-NEXT: shldl %cl, %edx, %edi
-; FALLBACK19-NEXT: movl (%eax), %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 52(%eax), %esi
-; FALLBACK19-NEXT: shldl %cl, %esi, %edx
-; FALLBACK19-NEXT: negl %ebp
-; FALLBACK19-NEXT: movl 176(%esp,%ebp), %ebp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK19-NEXT: movl %edx, 56(%eax)
-; FALLBACK19-NEXT: movl %edi, 60(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: shlxl %ecx, %edx, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK19-NEXT: shldl %cl, %edx, %edi
-; FALLBACK19-NEXT: shldl %cl, %ebp, %esi
-; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: shldl %cl, %edx, %ebp
-; FALLBACK19-NEXT: movl %ebp, 48(%eax)
-; FALLBACK19-NEXT: movl %esi, 52(%eax)
-; FALLBACK19-NEXT: movl %ebx, 40(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 44(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 32(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 36(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 24(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 28(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 16(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 20(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 8(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 12(%eax)
-; FALLBACK19-NEXT: movl %edi, 4(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, (%eax)
-; FALLBACK19-NEXT: addl $204, %esp
-; FALLBACK19-NEXT: popl %esi
-; FALLBACK19-NEXT: popl %edi
-; FALLBACK19-NEXT: popl %ebx
-; FALLBACK19-NEXT: popl %ebp
-; FALLBACK19-NEXT: retl
-;
-; FALLBACK20-LABEL: shl_64bytes:
-; FALLBACK20: # %bb.0:
-; FALLBACK20-NEXT: pushl %ebp
-; FALLBACK20-NEXT: pushl %ebx
-; FALLBACK20-NEXT: pushl %edi
-; FALLBACK20-NEXT: pushl %esi
-; FALLBACK20-NEXT: subl $204, %esp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movups (%ecx), %xmm0
-; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK20-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK20-NEXT: movl (%eax), %eax
-; FALLBACK20-NEXT: xorps %xmm4, %xmm4
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %eax, %edx
-; FALLBACK20-NEXT: andl $60, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: subl %edx, %ecx
-; FALLBACK20-NEXT: movl (%ecx), %edi
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 4(%ecx), %edx
-; FALLBACK20-NEXT: movl %ecx, %ebp
-; FALLBACK20-NEXT: shll $3, %eax
-; FALLBACK20-NEXT: andl $24, %eax
-; FALLBACK20-NEXT: movl %edx, %esi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: shrl %edi
-; FALLBACK20-NEXT: movb %al, %ch
-; FALLBACK20-NEXT: notb %ch
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: orl %esi, %edi
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 12(%ebp), %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl 8(%ebp), %esi
-; FALLBACK20-NEXT: movl %ebp, %edi
-; FALLBACK20-NEXT: movl %esi, %ebp
-; FALLBACK20-NEXT: shrl %ebp
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: orl %ebx, %ebp
-; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: shrl %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: orl %esi, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl %edi, %ebp
-; FALLBACK20-NEXT: movl 20(%edi), %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl 16(%edi), %esi
-; FALLBACK20-NEXT: movl %esi, %edx
-; FALLBACK20-NEXT: shrl %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: orl %ebx, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK20-NEXT: shrl %edi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: orl %esi, %edi
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl %ebp, %edx
-; FALLBACK20-NEXT: movl 28(%ebp), %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl 24(%ebp), %esi
-; FALLBACK20-NEXT: movl %esi, %edi
-; FALLBACK20-NEXT: shrl %edi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: orl %ebx, %edi
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK20-NEXT: shrl %ebp
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: orl %esi, %ebp
-; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 36(%edx), %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl 32(%edx), %esi
-; FALLBACK20-NEXT: movl %edx, %ebp
-; FALLBACK20-NEXT: movl %esi, %edi
-; FALLBACK20-NEXT: shrl %edi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: orl %ebx, %edi
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: orl %esi, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 44(%ebp), %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl 40(%ebp), %esi
-; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl %esi, %edx
-; FALLBACK20-NEXT: shrl %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: orl %ebx, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: orl %esi, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 52(%ebp), %esi
-; FALLBACK20-NEXT: movl %esi, %edi
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %edi
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: negl %edx
-; FALLBACK20-NEXT: movl 176(%esp,%edx), %ebx
-; FALLBACK20-NEXT: movl %ebx, %ebp
-; FALLBACK20-NEXT: shrl %ebp
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: orl %edi, %ebp
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: orl %ebx, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK20-NEXT: movl 60(%edi), %edx
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: movl 56(%edi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %edi
-; FALLBACK20-NEXT: shrl %edi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: orl %edx, %edi
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: shrl %esi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %esi
-; FALLBACK20-NEXT: orl %ebx, %esi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl %edx, (%eax)
-; FALLBACK20-NEXT: movl %esi, 56(%eax)
-; FALLBACK20-NEXT: movl %edi, 60(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 48(%eax)
-; FALLBACK20-NEXT: movl %ebp, 52(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 40(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 44(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 32(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 36(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 24(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 28(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 16(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 20(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 8(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 12(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 4(%eax)
-; FALLBACK20-NEXT: addl $204, %esp
-; FALLBACK20-NEXT: popl %esi
-; FALLBACK20-NEXT: popl %edi
-; FALLBACK20-NEXT: popl %ebx
-; FALLBACK20-NEXT: popl %ebp
-; FALLBACK20-NEXT: retl
-;
-; FALLBACK21-LABEL: shl_64bytes:
-; FALLBACK21: # %bb.0:
-; FALLBACK21-NEXT: pushl %ebp
-; FALLBACK21-NEXT: pushl %ebx
-; FALLBACK21-NEXT: pushl %edi
-; FALLBACK21-NEXT: pushl %esi
-; FALLBACK21-NEXT: subl $188, %esp
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK21-NEXT: movups (%ecx), %xmm0
-; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK21-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK21-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK21-NEXT: movl (%eax), %ecx
-; FALLBACK21-NEXT: xorps %xmm4, %xmm4
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %ecx, %ebp
-; FALLBACK21-NEXT: andl $60, %ebp
-; FALLBACK21-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: subl %ebp, %eax
-; FALLBACK21-NEXT: movl 8(%eax), %esi
-; FALLBACK21-NEXT: movl 12(%eax), %edx
-; FALLBACK21-NEXT: shll $3, %ecx
-; FALLBACK21-NEXT: andl $24, %ecx
-; FALLBACK21-NEXT: movl %edx, %edi
-; FALLBACK21-NEXT: shldl %cl, %esi, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 4(%eax), %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shldl %cl, %edi, %esi
-; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 16(%eax), %edi
-; FALLBACK21-NEXT: movl 20(%eax), %esi
-; FALLBACK21-NEXT: movl %esi, %ebx
-; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shldl %cl, %edx, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 24(%eax), %edi
-; FALLBACK21-NEXT: movl 28(%eax), %edx
-; FALLBACK21-NEXT: movl %edx, %ebx
-; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shldl %cl, %esi, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 32(%eax), %edi
-; FALLBACK21-NEXT: movl 36(%eax), %esi
-; FALLBACK21-NEXT: movl %esi, %ebx
-; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shldl %cl, %edx, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 40(%eax), %edx
-; FALLBACK21-NEXT: movl 44(%eax), %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shldl %cl, %edx, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shldl %cl, %esi, %edx
-; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK21-NEXT: movl 56(%eax), %edx
-; FALLBACK21-NEXT: movl 60(%eax), %edi
-; FALLBACK21-NEXT: shldl %cl, %edx, %edi
-; FALLBACK21-NEXT: movl (%eax), %ebx
-; FALLBACK21-NEXT: movl 52(%eax), %esi
-; FALLBACK21-NEXT: shldl %cl, %esi, %edx
-; FALLBACK21-NEXT: negl %ebp
-; FALLBACK21-NEXT: movl 160(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK21-NEXT: movl %edx, 56(%ebp)
-; FALLBACK21-NEXT: movl %edi, 60(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK21-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK21-NEXT: shll %cl, %ebx
-; FALLBACK21-NEXT: shldl %cl, %eax, %esi
-; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK21-NEXT: shldl %cl, %edi, %eax
-; FALLBACK21-NEXT: movl %eax, 48(%ebp)
-; FALLBACK21-NEXT: movl %esi, 52(%ebp)
-; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 40(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 44(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 32(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 36(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 24(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 28(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 16(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 20(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 8(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 12(%ebp)
-; FALLBACK21-NEXT: movl %ebx, (%ebp)
-; FALLBACK21-NEXT: movl %edx, 4(%ebp)
-; FALLBACK21-NEXT: addl $188, %esp
-; FALLBACK21-NEXT: popl %esi
-; FALLBACK21-NEXT: popl %edi
-; FALLBACK21-NEXT: popl %ebx
-; FALLBACK21-NEXT: popl %ebp
-; FALLBACK21-NEXT: retl
-;
-; FALLBACK22-LABEL: shl_64bytes:
-; FALLBACK22: # %bb.0:
-; FALLBACK22-NEXT: pushl %ebp
-; FALLBACK22-NEXT: pushl %ebx
-; FALLBACK22-NEXT: pushl %edi
-; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $204, %esp
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK22-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK22-NEXT: movl (%eax), %eax
-; FALLBACK22-NEXT: xorps %xmm4, %xmm4
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: leal (,%eax,8), %edx
-; FALLBACK22-NEXT: andl $24, %edx
-; FALLBACK22-NEXT: andl $60, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edi
-; FALLBACK22-NEXT: subl %eax, %edi
-; FALLBACK22-NEXT: movl (%edi), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 4(%edi), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl %edx, %ebx
-; FALLBACK22-NEXT: notb %bl
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %esi
-; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx
-; FALLBACK22-NEXT: orl %ecx, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 8(%edi), %esi
-; FALLBACK22-NEXT: movl %esi, %ecx
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK22-NEXT: movl 12(%edi), %ecx
-; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %edx, %esi, %esi
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: shrl %eax
-; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT: orl %esi, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 16(%edi), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrl %eax
-; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT: movl 20(%edi), %esi
-; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT: orl %eax, %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 24(%edi), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK22-NEXT: movl 28(%edi), %ecx
-; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: shrl %esi
-; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT: orl %eax, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 32(%edi), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrl %eax
-; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT: movl 36(%edi), %esi
-; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT: orl %eax, %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 40(%edi), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK22-NEXT: movl 44(%edi), %ecx
-; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: shrl %esi
-; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT: orl %eax, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 48(%edi), %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrl %esi
-; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax
-; FALLBACK22-NEXT: movl 52(%edi), %esi
-; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ebp
-; FALLBACK22-NEXT: orl %eax, %ebp
-; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: negl %eax
-; FALLBACK22-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
-; FALLBACK22-NEXT: movl 56(%edi), %eax
-; FALLBACK22-NEXT: shlxl %edx, %eax, %edx
-; FALLBACK22-NEXT: shrl %esi
-; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT: orl %edx, %esi
-; FALLBACK22-NEXT: shrl %eax
-; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT: orl %eax, %ecx
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK22-NEXT: movl %edx, (%eax)
-; FALLBACK22-NEXT: movl %esi, 56(%eax)
-; FALLBACK22-NEXT: movl %ecx, 60(%eax)
-; FALLBACK22-NEXT: movl %ebp, 48(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 52(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 40(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 44(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 32(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 36(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 24(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 28(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 16(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 20(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 8(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 12(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 4(%eax)
-; FALLBACK22-NEXT: addl $204, %esp
-; FALLBACK22-NEXT: popl %esi
-; FALLBACK22-NEXT: popl %edi
-; FALLBACK22-NEXT: popl %ebx
-; FALLBACK22-NEXT: popl %ebp
-; FALLBACK22-NEXT: retl
-;
-; FALLBACK23-LABEL: shl_64bytes:
-; FALLBACK23: # %bb.0:
-; FALLBACK23-NEXT: pushl %ebp
-; FALLBACK23-NEXT: pushl %ebx
-; FALLBACK23-NEXT: pushl %edi
-; FALLBACK23-NEXT: pushl %esi
-; FALLBACK23-NEXT: subl $204, %esp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK23-NEXT: movups (%ecx), %xmm0
-; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK23-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK23-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK23-NEXT: movl (%eax), %ebp
-; FALLBACK23-NEXT: xorps %xmm4, %xmm4
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: leal (,%ebp,8), %ecx
-; FALLBACK23-NEXT: andl $24, %ecx
-; FALLBACK23-NEXT: andl $60, %ebp
-; FALLBACK23-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: subl %ebp, %eax
-; FALLBACK23-NEXT: movl 4(%eax), %esi
-; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 8(%eax), %edi
-; FALLBACK23-NEXT: movl 12(%eax), %edx
-; FALLBACK23-NEXT: movl %edx, %ebx
-; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shldl %cl, %esi, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 16(%eax), %edi
-; FALLBACK23-NEXT: movl 20(%eax), %esi
-; FALLBACK23-NEXT: movl %esi, %ebx
-; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shldl %cl, %edx, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 24(%eax), %edi
-; FALLBACK23-NEXT: movl 28(%eax), %edx
-; FALLBACK23-NEXT: movl %edx, %ebx
-; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shldl %cl, %esi, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 32(%eax), %edi
-; FALLBACK23-NEXT: movl 36(%eax), %esi
-; FALLBACK23-NEXT: movl %esi, %ebx
-; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shldl %cl, %edx, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 40(%eax), %ebx
-; FALLBACK23-NEXT: movl 44(%eax), %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shldl %cl, %esi, %ebx
-; FALLBACK23-NEXT: movl 56(%eax), %edx
-; FALLBACK23-NEXT: movl 60(%eax), %edi
-; FALLBACK23-NEXT: shldl %cl, %edx, %edi
-; FALLBACK23-NEXT: movl (%eax), %esi
-; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 52(%eax), %esi
-; FALLBACK23-NEXT: shldl %cl, %esi, %edx
-; FALLBACK23-NEXT: negl %ebp
-; FALLBACK23-NEXT: movl 176(%esp,%ebp), %ebp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl %edx, 56(%eax)
-; FALLBACK23-NEXT: movl %edi, 60(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK23-NEXT: shlxl %ecx, %edx, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK23-NEXT: shldl %cl, %edx, %edi
-; FALLBACK23-NEXT: shldl %cl, %ebp, %esi
-; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK23-NEXT: shldl %cl, %edx, %ebp
-; FALLBACK23-NEXT: movl %ebp, 48(%eax)
-; FALLBACK23-NEXT: movl %esi, 52(%eax)
-; FALLBACK23-NEXT: movl %ebx, 40(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 44(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 32(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 36(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 24(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 28(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 16(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 20(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 8(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 12(%eax)
-; FALLBACK23-NEXT: movl %edi, 4(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, (%eax)
-; FALLBACK23-NEXT: addl $204, %esp
-; FALLBACK23-NEXT: popl %esi
-; FALLBACK23-NEXT: popl %edi
-; FALLBACK23-NEXT: popl %ebx
-; FALLBACK23-NEXT: popl %ebp
-; FALLBACK23-NEXT: retl
-;
-; FALLBACK24-LABEL: shl_64bytes:
-; FALLBACK24: # %bb.0:
-; FALLBACK24-NEXT: pushl %ebp
-; FALLBACK24-NEXT: pushl %ebx
-; FALLBACK24-NEXT: pushl %edi
-; FALLBACK24-NEXT: pushl %esi
-; FALLBACK24-NEXT: subl $204, %esp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1
-; FALLBACK24-NEXT: movl (%eax), %eax
-; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %eax, %edx
-; FALLBACK24-NEXT: andl $60, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: subl %edx, %ecx
-; FALLBACK24-NEXT: movl (%ecx), %edi
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 4(%ecx), %edx
-; FALLBACK24-NEXT: movl %ecx, %ebp
-; FALLBACK24-NEXT: shll $3, %eax
-; FALLBACK24-NEXT: andl $24, %eax
-; FALLBACK24-NEXT: movl %edx, %esi
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: shrl %edi
-; FALLBACK24-NEXT: movb %al, %ch
-; FALLBACK24-NEXT: notb %ch
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: orl %esi, %edi
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 12(%ebp), %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl 8(%ebp), %esi
-; FALLBACK24-NEXT: movl %ebp, %edi
-; FALLBACK24-NEXT: movl %esi, %ebp
-; FALLBACK24-NEXT: shrl %ebp
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: orl %ebx, %ebp
-; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: shrl %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: orl %esi, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl %edi, %ebp
-; FALLBACK24-NEXT: movl 20(%edi), %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl 16(%edi), %esi
-; FALLBACK24-NEXT: movl %esi, %edx
-; FALLBACK24-NEXT: shrl %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: orl %ebx, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK24-NEXT: shrl %edi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: orl %esi, %edi
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl %ebp, %edx
-; FALLBACK24-NEXT: movl 28(%ebp), %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl 24(%ebp), %esi
-; FALLBACK24-NEXT: movl %esi, %edi
-; FALLBACK24-NEXT: shrl %edi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: orl %ebx, %edi
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK24-NEXT: shrl %ebp
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: orl %esi, %ebp
-; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 36(%edx), %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl 32(%edx), %esi
-; FALLBACK24-NEXT: movl %edx, %ebp
-; FALLBACK24-NEXT: movl %esi, %edi
-; FALLBACK24-NEXT: shrl %edi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: orl %ebx, %edi
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: orl %esi, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 44(%ebp), %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl 40(%ebp), %esi
-; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl %esi, %edx
-; FALLBACK24-NEXT: shrl %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: orl %ebx, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: orl %esi, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 52(%ebp), %esi
-; FALLBACK24-NEXT: movl %esi, %edi
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %edi
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: negl %edx
-; FALLBACK24-NEXT: movl 176(%esp,%edx), %ebx
-; FALLBACK24-NEXT: movl %ebx, %ebp
-; FALLBACK24-NEXT: shrl %ebp
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: orl %edi, %ebp
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: orl %ebx, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK24-NEXT: movl 60(%edi), %edx
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: movl 56(%edi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %edi
-; FALLBACK24-NEXT: shrl %edi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: orl %edx, %edi
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: shrl %esi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %esi
-; FALLBACK24-NEXT: orl %ebx, %esi
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl %edx, (%eax)
-; FALLBACK24-NEXT: movl %esi, 56(%eax)
-; FALLBACK24-NEXT: movl %edi, 60(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 48(%eax)
-; FALLBACK24-NEXT: movl %ebp, 52(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 40(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 44(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 32(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 36(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 24(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 28(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 16(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 20(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 8(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 12(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 4(%eax)
-; FALLBACK24-NEXT: addl $204, %esp
-; FALLBACK24-NEXT: popl %esi
-; FALLBACK24-NEXT: popl %edi
-; FALLBACK24-NEXT: popl %ebx
-; FALLBACK24-NEXT: popl %ebp
-; FALLBACK24-NEXT: vzeroupper
-; FALLBACK24-NEXT: retl -; -; FALLBACK25-LABEL: shl_64bytes: -; FALLBACK25: # %bb.0: -; FALLBACK25-NEXT: pushl %ebp -; FALLBACK25-NEXT: pushl %ebx -; FALLBACK25-NEXT: pushl %edi -; FALLBACK25-NEXT: pushl %esi -; FALLBACK25-NEXT: subl $188, %esp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK25-NEXT: movl (%eax), %ecx -; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %ecx, %ebp -; FALLBACK25-NEXT: andl $60, %ebp -; FALLBACK25-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: subl %ebp, %eax -; FALLBACK25-NEXT: movl 8(%eax), %esi -; FALLBACK25-NEXT: movl 12(%eax), %edx -; FALLBACK25-NEXT: shll $3, %ecx -; FALLBACK25-NEXT: andl $24, %ecx -; FALLBACK25-NEXT: movl %edx, %edi -; FALLBACK25-NEXT: shldl %cl, %esi, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 4(%eax), %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %edi, %esi -; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 16(%eax), %edi -; FALLBACK25-NEXT: movl 20(%eax), %esi -; FALLBACK25-NEXT: movl %esi, %ebx -; FALLBACK25-NEXT: shldl %cl, %edi, %ebx -; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %edx, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 24(%eax), %edi -; FALLBACK25-NEXT: movl 28(%eax), %edx -; FALLBACK25-NEXT: movl %edx, %ebx -; FALLBACK25-NEXT: shldl %cl, %edi, %ebx -; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %esi, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 32(%eax), %edi -; FALLBACK25-NEXT: movl 36(%eax), %esi -; FALLBACK25-NEXT: movl %esi, %ebx -; FALLBACK25-NEXT: shldl %cl, %edi, %ebx -; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %edx, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 40(%eax), %edx -; FALLBACK25-NEXT: movl 44(%eax), %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %edx, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shldl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK25-NEXT: movl 56(%eax), %edx -; FALLBACK25-NEXT: movl 60(%eax), %edi -; FALLBACK25-NEXT: shldl %cl, %edx, %edi -; FALLBACK25-NEXT: movl (%eax), %ebx -; FALLBACK25-NEXT: movl 52(%eax), %esi -; FALLBACK25-NEXT: shldl %cl, %esi, %edx -; FALLBACK25-NEXT: negl %ebp -; FALLBACK25-NEXT: movl 160(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %edx, 56(%ebp) -; FALLBACK25-NEXT: movl %edi, 60(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK25-NEXT: shldl %cl, %ebx, %edx -; FALLBACK25-NEXT: shll %cl, %ebx -; FALLBACK25-NEXT: shldl %cl, %eax, %esi -; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; 
FALLBACK25-NEXT: shldl %cl, %edi, %eax -; FALLBACK25-NEXT: movl %eax, 48(%ebp) -; FALLBACK25-NEXT: movl %esi, 52(%ebp) -; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 40(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 44(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 32(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 36(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 24(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 28(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %ebx, (%ebp) -; FALLBACK25-NEXT: movl %edx, 4(%ebp) -; FALLBACK25-NEXT: addl $188, %esp -; FALLBACK25-NEXT: popl %esi -; FALLBACK25-NEXT: popl %edi -; FALLBACK25-NEXT: popl %ebx -; FALLBACK25-NEXT: popl %ebp -; FALLBACK25-NEXT: vzeroupper -; FALLBACK25-NEXT: retl -; -; FALLBACK26-LABEL: shl_64bytes: -; FALLBACK26: # %bb.0: -; FALLBACK26-NEXT: pushl %ebp -; FALLBACK26-NEXT: pushl %ebx -; FALLBACK26-NEXT: pushl %edi -; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $204, %esp -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK26-NEXT: movl (%eax), %eax -; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: leal (,%eax,8), %edx -; FALLBACK26-NEXT: andl $24, %edx -; FALLBACK26-NEXT: andl $60, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK26-NEXT: subl %eax, %edi -; FALLBACK26-NEXT: movl (%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 4(%edi), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 8(%edi), %esi -; FALLBACK26-NEXT: movl %esi, %ecx -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 12(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, %esi, %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; 
FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 16(%edi), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: movl 20(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %eax, %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 24(%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 28(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 32(%edi), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: movl 36(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %eax, %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 40(%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 44(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%edi), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK26-NEXT: movl 52(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK26-NEXT: orl %eax, %ebp -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: negl %eax -; FALLBACK26-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK26-NEXT: movl 56(%edi), %eax -; 
FALLBACK26-NEXT: shlxl %edx, %eax, %edx -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %edx, %esi -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %eax, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK26-NEXT: movl %edx, (%eax) -; FALLBACK26-NEXT: movl %esi, 56(%eax) -; FALLBACK26-NEXT: movl %ecx, 60(%eax) -; FALLBACK26-NEXT: movl %ebp, 48(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 52(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 40(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 44(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 32(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 36(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 24(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 28(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 16(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 20(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 8(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 12(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 4(%eax) -; FALLBACK26-NEXT: addl $204, %esp -; FALLBACK26-NEXT: popl %esi -; FALLBACK26-NEXT: popl %edi -; FALLBACK26-NEXT: popl %ebx -; FALLBACK26-NEXT: popl %ebp -; FALLBACK26-NEXT: vzeroupper -; FALLBACK26-NEXT: retl -; -; FALLBACK27-LABEL: shl_64bytes: -; FALLBACK27: # %bb.0: -; FALLBACK27-NEXT: pushl %ebp -; FALLBACK27-NEXT: pushl %ebx -; FALLBACK27-NEXT: pushl %edi -; FALLBACK27-NEXT: pushl %esi -; FALLBACK27-NEXT: subl $204, %esp -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK27-NEXT: movl (%eax), %ebx -; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK27-NEXT: leal (,%ebx,8), %ecx -; FALLBACK27-NEXT: andl $24, %ecx -; FALLBACK27-NEXT: andl $60, %ebx -; FALLBACK27-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: subl %ebx, %eax -; FALLBACK27-NEXT: movl 4(%eax), %esi -; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 8(%eax), %edi -; FALLBACK27-NEXT: movl 12(%eax), %edx -; FALLBACK27-NEXT: movl %edx, %ebp -; FALLBACK27-NEXT: shldl %cl, %edi, %ebp -; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %esi, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 16(%eax), %edi -; FALLBACK27-NEXT: movl 20(%eax), %esi -; FALLBACK27-NEXT: movl %esi, %ebp -; 
FALLBACK27-NEXT: shldl %cl, %edi, %ebp -; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %edx, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 24(%eax), %edi -; FALLBACK27-NEXT: movl 28(%eax), %edx -; FALLBACK27-NEXT: movl %edx, %ebp -; FALLBACK27-NEXT: shldl %cl, %edi, %ebp -; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %esi, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 32(%eax), %edi -; FALLBACK27-NEXT: movl 36(%eax), %esi -; FALLBACK27-NEXT: movl %esi, %ebp -; FALLBACK27-NEXT: shldl %cl, %edi, %ebp -; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %edx, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 40(%eax), %ebp -; FALLBACK27-NEXT: movl 44(%eax), %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %ebp, %edx -; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: shldl %cl, %esi, %ebp -; FALLBACK27-NEXT: movl 56(%eax), %edx -; FALLBACK27-NEXT: movl 60(%eax), %edi -; FALLBACK27-NEXT: shldl %cl, %edx, %edi -; FALLBACK27-NEXT: movl (%eax), %esi -; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl 52(%eax), %esi -; FALLBACK27-NEXT: shldl %cl, %esi, %edx -; FALLBACK27-NEXT: negl %ebx -; FALLBACK27-NEXT: movl 176(%esp,%ebx), %ebx -; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK27-NEXT: movl %edx, 56(%eax) -; FALLBACK27-NEXT: movl %edi, 60(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK27-NEXT: shlxl %ecx, %edx, %edi -; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK27-NEXT: shldl %cl, %edx, %edi -; FALLBACK27-NEXT: shldl %cl, %ebx, %esi -; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK27-NEXT: shldl %cl, %edx, %ebx -; FALLBACK27-NEXT: movl %ebx, 48(%eax) -; FALLBACK27-NEXT: movl %esi, 52(%eax) -; FALLBACK27-NEXT: movl %ebp, 40(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 44(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 32(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 36(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 24(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 28(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 16(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 20(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 8(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, 12(%eax) -; FALLBACK27-NEXT: movl %edi, 4(%eax) -; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK27-NEXT: movl %ecx, (%eax) -; FALLBACK27-NEXT: addl $204, 
%esp -; FALLBACK27-NEXT: popl %esi -; FALLBACK27-NEXT: popl %edi -; FALLBACK27-NEXT: popl %ebx -; FALLBACK27-NEXT: popl %ebp -; FALLBACK27-NEXT: vzeroupper -; FALLBACK27-NEXT: retl -; -; FALLBACK28-LABEL: shl_64bytes: -; FALLBACK28: # %bb.0: -; FALLBACK28-NEXT: pushl %ebp -; FALLBACK28-NEXT: pushl %ebx -; FALLBACK28-NEXT: pushl %edi -; FALLBACK28-NEXT: pushl %esi -; FALLBACK28-NEXT: subl $204, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK28-NEXT: movl (%eax), %eax -; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: andl $60, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: subl %edx, %ecx -; FALLBACK28-NEXT: movl (%ecx), %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 4(%ecx), %edx -; FALLBACK28-NEXT: movl %ecx, %ebp -; FALLBACK28-NEXT: shll $3, %eax -; FALLBACK28-NEXT: andl $24, %eax -; FALLBACK28-NEXT: movl %edx, %esi -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %al, %ch -; FALLBACK28-NEXT: notb %ch -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %esi, %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 12(%ebp), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 8(%ebp), %esi -; FALLBACK28-NEXT: movl %ebp, %edi -; FALLBACK28-NEXT: movl %esi, %ebp -; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %ebx, %ebp -; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %esi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %edi, %ebp -; FALLBACK28-NEXT: movl 20(%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 16(%edi), %esi -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %esi, %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %ebp, %edx -; FALLBACK28-NEXT: movl 28(%ebp), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 24(%ebp), %esi -; FALLBACK28-NEXT: movl %esi, %edi -; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl -; 
FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %ebx, %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %esi, %ebp -; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 36(%edx), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 32(%edx), %esi -; FALLBACK28-NEXT: movl %edx, %ebp -; FALLBACK28-NEXT: movl %esi, %edi -; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %ebx, %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %esi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 44(%ebp), %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 40(%ebp), %esi -; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %esi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 52(%ebp), %esi -; FALLBACK28-NEXT: movl %esi, %edi -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: negl %edx -; FALLBACK28-NEXT: movl 176(%esp,%edx), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %edi, %ebp -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK28-NEXT: movl 60(%edi), %edx -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: movl 56(%edi), %ebx -; FALLBACK28-NEXT: movl %ebx, %edi -; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %edx, %edi -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: shrl %esi -; FALLBACK28-NEXT: movb %ch, %cl -; 
FALLBACK28-NEXT: shrl %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK28-NEXT: movl %edx, (%eax) -; FALLBACK28-NEXT: movl %esi, 56(%eax) -; FALLBACK28-NEXT: movl %edi, 60(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 48(%eax) -; FALLBACK28-NEXT: movl %ebp, 52(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 40(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 44(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 32(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 36(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 24(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 28(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 16(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 20(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 8(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 12(%eax) -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: movl %ecx, 4(%eax) -; FALLBACK28-NEXT: addl $204, %esp -; FALLBACK28-NEXT: popl %esi -; FALLBACK28-NEXT: popl %edi -; FALLBACK28-NEXT: popl %ebx -; FALLBACK28-NEXT: popl %ebp -; FALLBACK28-NEXT: vzeroupper -; FALLBACK28-NEXT: retl -; -; FALLBACK29-LABEL: shl_64bytes: -; FALLBACK29: # %bb.0: -; FALLBACK29-NEXT: pushl %ebp -; FALLBACK29-NEXT: pushl %ebx -; FALLBACK29-NEXT: pushl %edi -; FALLBACK29-NEXT: pushl %esi -; FALLBACK29-NEXT: subl $188, %esp -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK29-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK29-NEXT: movl (%eax), %ecx -; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %ecx, %ebp -; FALLBACK29-NEXT: andl $60, %ebp -; FALLBACK29-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: subl %ebp, %eax -; FALLBACK29-NEXT: movl 8(%eax), %esi -; FALLBACK29-NEXT: movl 12(%eax), %edx -; FALLBACK29-NEXT: shll $3, %ecx -; FALLBACK29-NEXT: andl $24, %ecx -; FALLBACK29-NEXT: movl %edx, %edi -; FALLBACK29-NEXT: shldl %cl, %esi, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 4(%eax), %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %edi, %esi -; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 16(%eax), %edi -; FALLBACK29-NEXT: movl 20(%eax), %esi -; FALLBACK29-NEXT: movl %esi, %ebx -; FALLBACK29-NEXT: shldl %cl, %edi, %ebx -; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %edx, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: 
movl 24(%eax), %edi -; FALLBACK29-NEXT: movl 28(%eax), %edx -; FALLBACK29-NEXT: movl %edx, %ebx -; FALLBACK29-NEXT: shldl %cl, %edi, %ebx -; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %esi, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 32(%eax), %edi -; FALLBACK29-NEXT: movl 36(%eax), %esi -; FALLBACK29-NEXT: movl %esi, %ebx -; FALLBACK29-NEXT: shldl %cl, %edi, %ebx -; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %edx, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 40(%eax), %edx -; FALLBACK29-NEXT: movl 44(%eax), %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %edx, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shldl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK29-NEXT: movl 56(%eax), %edx -; FALLBACK29-NEXT: movl 60(%eax), %edi -; FALLBACK29-NEXT: shldl %cl, %edx, %edi -; FALLBACK29-NEXT: movl (%eax), %ebx -; FALLBACK29-NEXT: movl 52(%eax), %esi -; FALLBACK29-NEXT: shldl %cl, %esi, %edx -; FALLBACK29-NEXT: negl %ebp -; FALLBACK29-NEXT: movl 160(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %edx, 56(%ebp) -; FALLBACK29-NEXT: movl %edi, 60(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK29-NEXT: shldl %cl, %ebx, %edx -; FALLBACK29-NEXT: shll %cl, %ebx -; FALLBACK29-NEXT: shldl %cl, %eax, %esi -; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK29-NEXT: shldl %cl, %edi, %eax -; FALLBACK29-NEXT: movl %eax, 48(%ebp) -; FALLBACK29-NEXT: movl %esi, 52(%ebp) -; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 40(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 44(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 32(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 36(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 24(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 28(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %ebx, (%ebp) -; FALLBACK29-NEXT: movl %edx, 4(%ebp) -; FALLBACK29-NEXT: addl $188, %esp -; FALLBACK29-NEXT: popl %esi -; FALLBACK29-NEXT: popl %edi -; FALLBACK29-NEXT: popl %ebx -; FALLBACK29-NEXT: popl %ebp -; FALLBACK29-NEXT: vzeroupper -; FALLBACK29-NEXT: retl -; -; FALLBACK30-LABEL: shl_64bytes: -; FALLBACK30: # %bb.0: -; FALLBACK30-NEXT: pushl %ebp -; FALLBACK30-NEXT: pushl %ebx -; FALLBACK30-NEXT: pushl %edi -; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $204, %esp -; FALLBACK30-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK30-NEXT: movl (%eax), %eax -; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: leal (,%eax,8), %edx -; FALLBACK30-NEXT: andl $24, %edx -; FALLBACK30-NEXT: andl $60, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK30-NEXT: subl %eax, %edi -; FALLBACK30-NEXT: movl (%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 4(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 8(%edi), %esi -; FALLBACK30-NEXT: movl %esi, %ecx -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 12(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, %esi, %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 16(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: movl 20(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %eax, %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 24(%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 28(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 32(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: movl 36(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %eax, %ecx -; 
FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 40(%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 44(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%edi), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl 52(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK30-NEXT: orl %eax, %ebp -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: negl %eax -; FALLBACK30-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK30-NEXT: movl 56(%edi), %eax -; FALLBACK30-NEXT: shlxl %edx, %eax, %edx -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %edx, %esi -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %eax, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK30-NEXT: movl %edx, (%eax) -; FALLBACK30-NEXT: movl %esi, 56(%eax) -; FALLBACK30-NEXT: movl %ecx, 60(%eax) -; FALLBACK30-NEXT: movl %ebp, 48(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 52(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 40(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 44(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 32(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 36(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 24(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 28(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 16(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 20(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 8(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 12(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 4(%eax) -; FALLBACK30-NEXT: addl $204, %esp -; 
FALLBACK30-NEXT: popl %esi -; FALLBACK30-NEXT: popl %edi -; FALLBACK30-NEXT: popl %ebx -; FALLBACK30-NEXT: popl %ebp -; FALLBACK30-NEXT: vzeroupper -; FALLBACK30-NEXT: retl -; -; FALLBACK31-LABEL: shl_64bytes: -; FALLBACK31: # %bb.0: -; FALLBACK31-NEXT: pushl %ebp -; FALLBACK31-NEXT: pushl %ebx -; FALLBACK31-NEXT: pushl %edi -; FALLBACK31-NEXT: pushl %esi -; FALLBACK31-NEXT: subl $204, %esp -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK31-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK31-NEXT: movl (%eax), %ebx -; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK31-NEXT: leal (,%ebx,8), %ecx -; FALLBACK31-NEXT: andl $24, %ecx -; FALLBACK31-NEXT: andl $60, %ebx -; FALLBACK31-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: subl %ebx, %eax -; FALLBACK31-NEXT: movl 4(%eax), %esi -; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 8(%eax), %edi -; FALLBACK31-NEXT: movl 12(%eax), %edx -; FALLBACK31-NEXT: movl %edx, %ebp -; FALLBACK31-NEXT: shldl %cl, %edi, %ebp -; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %esi, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 16(%eax), %edi -; FALLBACK31-NEXT: movl 20(%eax), %esi -; FALLBACK31-NEXT: movl %esi, %ebp -; FALLBACK31-NEXT: shldl %cl, %edi, %ebp -; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %edx, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 24(%eax), %edi -; FALLBACK31-NEXT: movl 28(%eax), %edx -; FALLBACK31-NEXT: movl %edx, %ebp -; FALLBACK31-NEXT: shldl %cl, %edi, %ebp -; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %esi, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 32(%eax), %edi -; FALLBACK31-NEXT: movl 36(%eax), %esi -; FALLBACK31-NEXT: movl %esi, %ebp -; FALLBACK31-NEXT: shldl %cl, %edi, %ebp -; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %edx, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 40(%eax), %ebp -; FALLBACK31-NEXT: movl 44(%eax), %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %ebp, %edx -; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: shldl %cl, %esi, %ebp -; FALLBACK31-NEXT: movl 56(%eax), %edx -; FALLBACK31-NEXT: movl 60(%eax), %edi -; FALLBACK31-NEXT: shldl %cl, %edx, %edi -; FALLBACK31-NEXT: movl (%eax), %esi -; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl 52(%eax), %esi -; FALLBACK31-NEXT: shldl %cl, %esi, %edx -; FALLBACK31-NEXT: negl %ebx -; FALLBACK31-NEXT: movl 176(%esp,%ebx), %ebx -; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK31-NEXT: movl %edx, 56(%eax) -; FALLBACK31-NEXT: movl %edi, 60(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK31-NEXT: shlxl %ecx, %edx, %edi -; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK31-NEXT: shldl %cl, %edx, %edi -; FALLBACK31-NEXT: shldl %cl, %ebx, 
%esi -; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK31-NEXT: shldl %cl, %edx, %ebx -; FALLBACK31-NEXT: movl %ebx, 48(%eax) -; FALLBACK31-NEXT: movl %esi, 52(%eax) -; FALLBACK31-NEXT: movl %ebp, 40(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 44(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 32(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 36(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 24(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 28(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 16(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 20(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 8(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, 12(%eax) -; FALLBACK31-NEXT: movl %edi, 4(%eax) -; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK31-NEXT: movl %ecx, (%eax) -; FALLBACK31-NEXT: addl $204, %esp -; FALLBACK31-NEXT: popl %esi -; FALLBACK31-NEXT: popl %edi -; FALLBACK31-NEXT: popl %ebx -; FALLBACK31-NEXT: popl %ebp -; FALLBACK31-NEXT: vzeroupper -; FALLBACK31-NEXT: retl +; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes: +; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: negl %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movslq %esi, %rbx +; 
X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rbx), %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rbx), %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %r9 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%rbx), %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rbx), %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r14, %r11 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r15, %rdi +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%rbx), %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%rbx), %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r13, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r12, %r15 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r13, %r10 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%rbx), %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%rbx), %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r12, %r13 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r14 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r13, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: 
popq %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negl %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movslq %esi, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%r9), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%r9), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r11, %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r10, %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%r9), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%r9), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r10, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r8, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negl %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movslq %esi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r9, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r14, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rbx, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r15, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r12, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rsi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r14, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rbx, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rcx, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negl %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movslq %esi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r11, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r9, %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rbx, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r10, %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: negl %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movslq %ecx, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r9), %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9), %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r15, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r12, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r13, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -8(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negl %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movslq %eax, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r9, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r14, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negl %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movslq %esi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r9, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r9, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r10, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r11, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rbx, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r15, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r15, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r12, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r15, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rcx, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negl %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movslq %eax, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r9, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r14, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes:
+; X64-NO-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: negl %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movslq %ecx, %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -24(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -32(%rsp,%r9), %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -40(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r11, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -48(%rsp,%r9), %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -64(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -56(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r15, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r12, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -16(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r13, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -8(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r9, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: negl %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movslq %eax, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -32(%rsp,%r8), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r9, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r14, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rax, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rsi, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: negl %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movslq %esi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -24(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %rdi, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -32(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r8, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r9, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -40(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r9, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r10, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -48(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r10, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r11, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -64(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -56(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r11, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %rbx, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r15, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -16(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r15, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r12, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r15, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rcx, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r14, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: negl %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movslq %eax, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -32(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r9, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r14, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsi, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: negl %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movslq %ecx, %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -24(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -32(%rsp,%r9), %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -40(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r11, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -48(%rsp,%r9), %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -64(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -56(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r15, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r12, %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -16(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r13, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -8(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r9, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: negl %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movslq %eax, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -32(%rsp,%r8), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r9, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r14, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rax, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rsi, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: negl %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movslq %esi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -24(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %rdi, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -32(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r8, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r9, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -40(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r9, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r10, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -48(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r10, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r11, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -64(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -56(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r11, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %rbx, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r15, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -16(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r15, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r12, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r15, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rcx, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: negl %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movslq %eax, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -32(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r9, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r14, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsi, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retq
+;
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%eax), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%eax), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%eax), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%eax), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%edx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: negl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 176(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%edi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%ecx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%ecx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%ecx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%ecx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:
xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl %ebp, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 160(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; 
X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ebp), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; 
X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%ebp,8), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %bl +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %ecx, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx 
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %esi +; 
X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%edx), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 56(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 60(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 48(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 52(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 40(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 44(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 32(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 36(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; 
X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $204, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%ebp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%ebp,8), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl %ebp, %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 176(%esp,%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 56(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 60(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %ebp, %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, 48(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 52(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 40(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 44(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 32(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 36(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $204, %esp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes: +; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 4(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %ch +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 12(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 8(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %edi +; 
X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%edi), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%edx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%edx), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: 
movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%ebp), %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: negl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 176(%esp,%edx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%edi), %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%edi), %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edx, %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi +; 
X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 56(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 60(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 48(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 40(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; 
X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl %ebp, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 8(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 12(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 4(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 160(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp +; 
X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4 +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps 
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%eax,8), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 4(%edx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 8(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 12(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 32(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%edx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %esi, %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %esi, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebp, %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 56(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 60(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 48(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 52(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 40(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 44(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 32(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 36(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 20(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%ebp,8), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 4(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 8(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 12(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 32(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 176(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 60(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %ebp, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 48(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 40(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 36(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes:
+; X86-NO-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: subl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 4(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 12(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 8(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 20(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 16(%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 28(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 24(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 36(%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 32(%edx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 44(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 40(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: negl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 176(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%edi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: subl %ebp, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 8(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 12(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 4(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 16(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 20(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 24(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 28(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 32(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 36(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 40(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 44(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: negl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 160(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%eax,8), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: subl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 4(%edx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %bl
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ecx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 8(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 12(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ecx, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 16(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 20(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 24(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 28(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 32(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 36(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 40(%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 44(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %eax, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 48(%edx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %esi, %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %esi, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ebp, %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: negl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 56(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, 60(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 48(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 52(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 40(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 44(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 32(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 36(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 20(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: subl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%ebx,8), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: subl %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 4(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 8(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 12(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 16(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 20(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 24(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 28(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 32(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 36(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 40(%eax), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 44(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %ebp, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 60(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: negl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 176(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, 60(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ecx, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, 48(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, 40(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 36(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X86-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 4(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 12(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 8(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 20(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 16(%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 28(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 24(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 36(%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 32(%edx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 44(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 40(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: negl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 176(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%edi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl %ebp, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 8(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 12(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx
+;
X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 4(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 16(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 20(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 24(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 28(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 32(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 36(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 40(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 44(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: negl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 160(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %ebx, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %eax +; 
X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper +; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0 +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%eax,8), %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl %eax, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 4(%edx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %bl +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 8(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 12(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 16(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 20(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 24(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 28(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 32(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 36(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 40(%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 44(%edx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 48(%edx), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%edx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %esi, %ecx, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %esi, %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%edx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: negl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ecx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, (%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 56(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 60(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 48(%edx) +; 
X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 52(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 40(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 4(%edx) +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper +; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%ebx,8), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl %ebx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 4(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 8(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 12(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 
%ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 16(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 20(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 24(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 28(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 32(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 36(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 40(%eax), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 44(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %ebp, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 60(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: negl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 176(%esp,%ebx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 56(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 60(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %ebx, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 48(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 52(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, 40(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 44(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 32(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 36(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, (%eax) +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper +; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %byteOff = load i512, ptr %byteOff.ptr, align 1 %bitOff = shl i512 %byteOff, 3 @@ -20159,4099 +17857,3115 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou } define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; FALLBACK0-LABEL: ashr_64bytes: -; FALLBACK0: # %bb.0: -; FALLBACK0-NEXT: pushq %r15 -; FALLBACK0-NEXT: pushq %r14 -; FALLBACK0-NEXT: pushq %r13 -; FALLBACK0-NEXT: pushq %r12 -; FALLBACK0-NEXT: pushq %rbx -; FALLBACK0-NEXT: movq (%rdi), %rax -; FALLBACK0-NEXT: movq 8(%rdi), %rcx -; FALLBACK0-NEXT: movq 16(%rdi), %r8 -; FALLBACK0-NEXT: movq 24(%rdi), %r9 -; FALLBACK0-NEXT: movq 32(%rdi), %r10 -; FALLBACK0-NEXT: movq 40(%rdi), %r11 -; FALLBACK0-NEXT: movq 48(%rdi), %rbx -; FALLBACK0-NEXT: movq 56(%rdi), %r14 -; FALLBACK0-NEXT: movl (%rsi), %edi -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: sarq $63, %r14 -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, 
-{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK0-NEXT: leal (,%rdi,8), %eax -; FALLBACK0-NEXT: andl $56, %eax -; FALLBACK0-NEXT: andl $56, %edi -; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10 -; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8 -; FALLBACK0-NEXT: movq %r8, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r11 -; FALLBACK0-NEXT: movl %eax, %esi -; FALLBACK0-NEXT: notb %sil -; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx -; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r9 -; FALLBACK0-NEXT: orq %r11, %r9 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r10 -; FALLBACK0-NEXT: addq %r8, %r8 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r8 -; FALLBACK0-NEXT: orq %r10, %r8 -; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10 -; FALLBACK0-NEXT: movq %r10, %r15 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r15 -; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14 -; FALLBACK0-NEXT: leaq (%r14,%r14), %r11 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r11 -; FALLBACK0-NEXT: orq %r15, %r11 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %rbx -; FALLBACK0-NEXT: addq %r10, %r10 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r10 -; FALLBACK0-NEXT: orq %rbx, %r10 -; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx -; FALLBACK0-NEXT: movq %rbx, %r12 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r12 -; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13 -; FALLBACK0-NEXT: leaq (%r13,%r13), %r15 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r15 -; FALLBACK0-NEXT: orq %r12, %r15 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r14 -; FALLBACK0-NEXT: addq %rbx, %rbx -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %rbx -; FALLBACK0-NEXT: orq %r14, %rbx -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: shrq %cl, %r13 -; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi -; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14 -; FALLBACK0-NEXT: movl %esi, %ecx -; FALLBACK0-NEXT: shlq %cl, %r14 -; FALLBACK0-NEXT: orq %r13, %r14 -; FALLBACK0-NEXT: movl %eax, %ecx -; FALLBACK0-NEXT: sarq %cl, %rdi -; FALLBACK0-NEXT: movq %rdi, 56(%rdx) -; FALLBACK0-NEXT: movq %r14, 48(%rdx) -; FALLBACK0-NEXT: movq %rbx, 32(%rdx) -; FALLBACK0-NEXT: movq %r15, 40(%rdx) -; FALLBACK0-NEXT: movq %r10, 16(%rdx) -; FALLBACK0-NEXT: movq %r11, 24(%rdx) -; FALLBACK0-NEXT: movq %r8, (%rdx) -; FALLBACK0-NEXT: movq %r9, 8(%rdx) -; FALLBACK0-NEXT: popq %rbx -; FALLBACK0-NEXT: popq %r12 -; FALLBACK0-NEXT: popq %r13 -; FALLBACK0-NEXT: popq %r14 -; FALLBACK0-NEXT: popq %r15 -; FALLBACK0-NEXT: retq -; -; FALLBACK1-LABEL: ashr_64bytes: -; FALLBACK1: # %bb.0: -; FALLBACK1-NEXT: pushq %r15 -; FALLBACK1-NEXT: pushq %r14 -; FALLBACK1-NEXT: pushq %rbx -; FALLBACK1-NEXT: movq (%rdi), %rcx -; FALLBACK1-NEXT: movq 8(%rdi), %r8 -; FALLBACK1-NEXT: movq 16(%rdi), %r9 -; FALLBACK1-NEXT: movq 24(%rdi), %r10 -; FALLBACK1-NEXT: movq 32(%rdi), %r11 -; FALLBACK1-NEXT: movq 40(%rdi), %rbx -; FALLBACK1-NEXT: movq 48(%rdi), %r14 -; FALLBACK1-NEXT: movq 56(%rdi), %rdi -; FALLBACK1-NEXT: movl (%rsi), %eax -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; 
FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: sarq $63, %rdi -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK1-NEXT: leal (,%rax,8), %ecx -; FALLBACK1-NEXT: andl $56, %ecx -; FALLBACK1-NEXT: andl $56, %eax -; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi -; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi -; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9 -; FALLBACK1-NEXT: movq %r9, %r8 -; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8 -; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11 -; FALLBACK1-NEXT: movq %r11, %rbx -; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx -; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11 -; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK1-NEXT: movq %r14, %r15 -; FALLBACK1-NEXT: shrdq %cl, %r11, %r15 -; FALLBACK1-NEXT: shrdq %cl, %r14, %r10 -; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK1-NEXT: shrdq %cl, %rax, %r11 -; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi -; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK1-NEXT: sarq %cl, %rax -; FALLBACK1-NEXT: movq %r11, 48(%rdx) -; FALLBACK1-NEXT: movq %rax, 56(%rdx) -; FALLBACK1-NEXT: movq %r10, 32(%rdx) -; FALLBACK1-NEXT: movq %r15, 40(%rdx) -; FALLBACK1-NEXT: movq %rdi, 16(%rdx) -; FALLBACK1-NEXT: movq %rbx, 24(%rdx) -; FALLBACK1-NEXT: movq %rsi, (%rdx) -; FALLBACK1-NEXT: movq %r8, 8(%rdx) -; FALLBACK1-NEXT: popq %rbx -; FALLBACK1-NEXT: popq %r14 -; FALLBACK1-NEXT: popq %r15 -; FALLBACK1-NEXT: retq -; -; FALLBACK2-LABEL: ashr_64bytes: -; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp -; FALLBACK2-NEXT: pushq %r15 -; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 -; FALLBACK2-NEXT: pushq %r12 -; FALLBACK2-NEXT: pushq %rbx -; FALLBACK2-NEXT: pushq %rax -; FALLBACK2-NEXT: movq (%rdi), %rcx -; FALLBACK2-NEXT: movq 8(%rdi), %r8 -; FALLBACK2-NEXT: movq 16(%rdi), %r9 -; FALLBACK2-NEXT: movq 24(%rdi), %r10 -; FALLBACK2-NEXT: movq 32(%rdi), %r11 -; FALLBACK2-NEXT: movq 40(%rdi), %rbx -; FALLBACK2-NEXT: movq 48(%rdi), %r14 -; FALLBACK2-NEXT: movq 56(%rdi), %rdi -; FALLBACK2-NEXT: movl (%rsi), %eax -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: sarq $63, %rdi -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK2-NEXT: leal (,%rax,8), %ecx -; FALLBACK2-NEXT: andl $56, 
%ecx -; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx -; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp -; FALLBACK2-NEXT: movl %ecx, %r12d -; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 -; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi -; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 -; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: sarxq %rcx, %rax, %rcx -; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 -; FALLBACK2-NEXT: orq %r8, %r10 -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 -; FALLBACK2-NEXT: orq %r15, %r8 -; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 -; FALLBACK2-NEXT: addq %rax, %rax -; FALLBACK2-NEXT: shlxq %r12, %rax, %rax -; FALLBACK2-NEXT: orq %r13, %rax -; FALLBACK2-NEXT: movq %rcx, 56(%rdx) -; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r8, 40(%rdx) -; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) -; FALLBACK2-NEXT: addq $8, %rsp -; FALLBACK2-NEXT: popq %rbx -; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 -; FALLBACK2-NEXT: popq %r14 -; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp -; FALLBACK2-NEXT: retq -; -; FALLBACK3-LABEL: ashr_64bytes: -; FALLBACK3: # %bb.0: -; FALLBACK3-NEXT: pushq %r15 -; FALLBACK3-NEXT: pushq %r14 -; FALLBACK3-NEXT: pushq %rbx -; FALLBACK3-NEXT: movq (%rdi), %rcx -; FALLBACK3-NEXT: movq 8(%rdi), %r8 -; FALLBACK3-NEXT: movq 16(%rdi), %r9 -; FALLBACK3-NEXT: movq 24(%rdi), %r10 -; FALLBACK3-NEXT: movq 32(%rdi), %r11 -; FALLBACK3-NEXT: movq 40(%rdi), %rbx -; FALLBACK3-NEXT: movq 48(%rdi), %r14 -; FALLBACK3-NEXT: movq 56(%rdi), %rdi -; FALLBACK3-NEXT: movl (%rsi), %eax -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: sarq $63, %rdi -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK3-NEXT: leal (,%rax,8), %ecx -; FALLBACK3-NEXT: andl $56, %ecx -; FALLBACK3-NEXT: andl $56, %eax -; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi -; FALLBACK3-NEXT: movq 
-128(%rsp,%rax), %rsi -; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9 -; FALLBACK3-NEXT: movq %r9, %r8 -; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8 -; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11 -; FALLBACK3-NEXT: movq %r11, %rbx -; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx -; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11 -; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK3-NEXT: movq %r14, %r15 -; FALLBACK3-NEXT: shrdq %cl, %r11, %r15 -; FALLBACK3-NEXT: shrdq %cl, %r14, %r10 -; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK3-NEXT: shrdq %cl, %rax, %r11 -; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax -; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi -; FALLBACK3-NEXT: movq %r11, 48(%rdx) -; FALLBACK3-NEXT: movq %r10, 32(%rdx) -; FALLBACK3-NEXT: movq %r15, 40(%rdx) -; FALLBACK3-NEXT: movq %rdi, 16(%rdx) -; FALLBACK3-NEXT: movq %rbx, 24(%rdx) -; FALLBACK3-NEXT: movq %rsi, (%rdx) -; FALLBACK3-NEXT: movq %r8, 8(%rdx) -; FALLBACK3-NEXT: movq %rax, 56(%rdx) -; FALLBACK3-NEXT: popq %rbx -; FALLBACK3-NEXT: popq %r14 -; FALLBACK3-NEXT: popq %r15 -; FALLBACK3-NEXT: retq -; -; FALLBACK4-LABEL: ashr_64bytes: -; FALLBACK4: # %bb.0: -; FALLBACK4-NEXT: pushq %rbp -; FALLBACK4-NEXT: pushq %r15 -; FALLBACK4-NEXT: pushq %r14 -; FALLBACK4-NEXT: pushq %r13 -; FALLBACK4-NEXT: pushq %r12 -; FALLBACK4-NEXT: pushq %rbx -; FALLBACK4-NEXT: pushq %rax -; FALLBACK4-NEXT: movups (%rdi), %xmm0 -; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK4-NEXT: movq 48(%rdi), %rax -; FALLBACK4-NEXT: movq 56(%rdi), %rcx -; FALLBACK4-NEXT: movl (%rsi), %edi -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: sarq $63, %rcx -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK4-NEXT: leal (,%rdi,8), %eax -; FALLBACK4-NEXT: andl $56, %eax -; FALLBACK4-NEXT: andl $56, %edi -; FALLBACK4-NEXT: movq -128(%rsp,%rdi), %r10 -; FALLBACK4-NEXT: movq -120(%rsp,%rdi), %r9 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r10 -; FALLBACK4-NEXT: movl %eax, %esi -; FALLBACK4-NEXT: notb %sil -; FALLBACK4-NEXT: leaq (%r9,%r9), %r8 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r8 -; FALLBACK4-NEXT: orq %r10, %r8 -; FALLBACK4-NEXT: movq -104(%rsp,%rdi), %r10 -; FALLBACK4-NEXT: movq %r10, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %rbx -; FALLBACK4-NEXT: movq -96(%rsp,%rdi), %r12 -; FALLBACK4-NEXT: leaq (%r12,%r12), %r11 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r11 -; FALLBACK4-NEXT: orq %rbx, %r11 -; FALLBACK4-NEXT: movq -112(%rsp,%rdi), %rbx -; FALLBACK4-NEXT: movq %rbx, %r14 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r14 -; FALLBACK4-NEXT: addq %r10, %r10 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r10 -; FALLBACK4-NEXT: orq %r14, %r10 -; FALLBACK4-NEXT: movq -88(%rsp,%rdi), %r14 -; FALLBACK4-NEXT: movq %r14, 
%r13 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r13 -; FALLBACK4-NEXT: movq -80(%rsp,%rdi), %rbp -; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r15 -; FALLBACK4-NEXT: orq %r13, %r15 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r12 -; FALLBACK4-NEXT: addq %r14, %r14 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r14 -; FALLBACK4-NEXT: orq %r12, %r14 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %rbp -; FALLBACK4-NEXT: movq -72(%rsp,%rdi), %rdi -; FALLBACK4-NEXT: leaq (%rdi,%rdi), %r12 -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %r12 -; FALLBACK4-NEXT: orq %rbp, %r12 -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: shrq %cl, %r9 -; FALLBACK4-NEXT: addq %rbx, %rbx -; FALLBACK4-NEXT: movl %esi, %ecx -; FALLBACK4-NEXT: shlq %cl, %rbx -; FALLBACK4-NEXT: orq %r9, %rbx -; FALLBACK4-NEXT: movl %eax, %ecx -; FALLBACK4-NEXT: sarq %cl, %rdi -; FALLBACK4-NEXT: movq %rdi, 56(%rdx) -; FALLBACK4-NEXT: movq %rbx, 8(%rdx) -; FALLBACK4-NEXT: movq %r12, 48(%rdx) -; FALLBACK4-NEXT: movq %r14, 32(%rdx) -; FALLBACK4-NEXT: movq %r15, 40(%rdx) -; FALLBACK4-NEXT: movq %r10, 16(%rdx) -; FALLBACK4-NEXT: movq %r11, 24(%rdx) -; FALLBACK4-NEXT: movq %r8, (%rdx) -; FALLBACK4-NEXT: addq $8, %rsp -; FALLBACK4-NEXT: popq %rbx -; FALLBACK4-NEXT: popq %r12 -; FALLBACK4-NEXT: popq %r13 -; FALLBACK4-NEXT: popq %r14 -; FALLBACK4-NEXT: popq %r15 -; FALLBACK4-NEXT: popq %rbp -; FALLBACK4-NEXT: retq -; -; FALLBACK5-LABEL: ashr_64bytes: -; FALLBACK5: # %bb.0: -; FALLBACK5-NEXT: pushq %r15 -; FALLBACK5-NEXT: pushq %r14 -; FALLBACK5-NEXT: pushq %rbx -; FALLBACK5-NEXT: movups (%rdi), %xmm0 -; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK5-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK5-NEXT: movq 48(%rdi), %rcx -; FALLBACK5-NEXT: movq 56(%rdi), %rdi -; FALLBACK5-NEXT: movl (%rsi), %eax -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: sarq $63, %rdi -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK5-NEXT: leal (,%rax,8), %ecx -; FALLBACK5-NEXT: andl $56, %ecx -; FALLBACK5-NEXT: andl $56, %eax -; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK5-NEXT: movq %r9, %rsi -; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10 -; FALLBACK5-NEXT: movq %r10, %r8 -; FALLBACK5-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11 -; FALLBACK5-NEXT: movq %r11, %rbx -; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11 -; FALLBACK5-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK5-NEXT: movq %rax, %r15 -; FALLBACK5-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK5-NEXT: shrdq %cl, %rax, %r14 -; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx -; 
FALLBACK5-NEXT: sarq %cl, %r11 -; FALLBACK5-NEXT: movq %r15, 8(%rdx) -; FALLBACK5-NEXT: movq %r9, 48(%rdx) -; FALLBACK5-NEXT: movq %r11, 56(%rdx) -; FALLBACK5-NEXT: movq %rdi, 32(%rdx) -; FALLBACK5-NEXT: movq %rbx, 40(%rdx) -; FALLBACK5-NEXT: movq %r8, 16(%rdx) -; FALLBACK5-NEXT: movq %rsi, 24(%rdx) -; FALLBACK5-NEXT: movq %r14, (%rdx) -; FALLBACK5-NEXT: popq %rbx -; FALLBACK5-NEXT: popq %r14 -; FALLBACK5-NEXT: popq %r15 -; FALLBACK5-NEXT: retq -; -; FALLBACK6-LABEL: ashr_64bytes: -; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp -; FALLBACK6-NEXT: pushq %r15 -; FALLBACK6-NEXT: pushq %r14 -; FALLBACK6-NEXT: pushq %r13 -; FALLBACK6-NEXT: pushq %r12 -; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: pushq %rax -; FALLBACK6-NEXT: movups (%rdi), %xmm0 -; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 -; FALLBACK6-NEXT: movq 48(%rdi), %rcx -; FALLBACK6-NEXT: movq 56(%rdi), %rdi -; FALLBACK6-NEXT: movl (%rsi), %eax -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: sarq $63, %rdi -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %esi -; FALLBACK6-NEXT: andl $56, %esi -; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK6-NEXT: movl %esi, %ebx -; FALLBACK6-NEXT: notb %bl -; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK6-NEXT: orq %r11, %r8 -; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK6-NEXT: orq %r12, %r11 -; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp -; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK6-NEXT: sarxq %rsi, %rax, %rsi -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK6-NEXT: orq %r9, %rdi -; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK6-NEXT: orq %r14, %r9 -; FALLBACK6-NEXT: addq %r10, %r10 -; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK6-NEXT: orq %r15, %r10 -; FALLBACK6-NEXT: addq %rax, %rax -; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK6-NEXT: orq %r13, %rax -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK6-NEXT: orq %rbp, %rcx -; FALLBACK6-NEXT: movq %rsi, 56(%rdx) -; FALLBACK6-NEXT: movq %rcx, 8(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %r10, 32(%rdx) -; FALLBACK6-NEXT: movq %r9, 40(%rdx) -; FALLBACK6-NEXT: movq %rdi, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r8, 
(%rdx)
-; FALLBACK6-NEXT: addq $8, %rsp
-; FALLBACK6-NEXT: popq %rbx
-; FALLBACK6-NEXT: popq %r12
-; FALLBACK6-NEXT: popq %r13
-; FALLBACK6-NEXT: popq %r14
-; FALLBACK6-NEXT: popq %r15
-; FALLBACK6-NEXT: popq %rbp
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: ashr_64bytes:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: pushq %r15
-; FALLBACK7-NEXT: pushq %r14
-; FALLBACK7-NEXT: pushq %rbx
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK7-NEXT: movups 32(%rdi), %xmm2
-; FALLBACK7-NEXT: movq 48(%rdi), %rcx
-; FALLBACK7-NEXT: movq 56(%rdi), %rdi
-; FALLBACK7-NEXT: movl (%rsi), %eax
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: sarq $63, %rdi
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: leal (,%rax,8), %ecx
-; FALLBACK7-NEXT: andl $56, %ecx
-; FALLBACK7-NEXT: andl $56, %eax
-; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK7-NEXT: movq %r9, %rsi
-; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK7-NEXT: movq %r10, %r8
-; FALLBACK7-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK7-NEXT: movq %r11, %rbx
-; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK7-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK7-NEXT: movq %rax, %r15
-; FALLBACK7-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK7-NEXT: sarxq %rcx, %r11, %r10
-; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK7-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK7-NEXT: movq %r15, 8(%rdx)
-; FALLBACK7-NEXT: movq %r9, 48(%rdx)
-; FALLBACK7-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK7-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK7-NEXT: movq %r8, 16(%rdx)
-; FALLBACK7-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK7-NEXT: movq %r14, (%rdx)
-; FALLBACK7-NEXT: movq %r10, 56(%rdx)
-; FALLBACK7-NEXT: popq %rbx
-; FALLBACK7-NEXT: popq %r14
-; FALLBACK7-NEXT: popq %r15
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: ashr_64bytes:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: pushq %rbp
-; FALLBACK8-NEXT: pushq %r15
-; FALLBACK8-NEXT: pushq %r14
-; FALLBACK8-NEXT: pushq %r13
-; FALLBACK8-NEXT: pushq %r12
-; FALLBACK8-NEXT: pushq %rbx
-; FALLBACK8-NEXT: pushq %rax
-; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK8-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK8-NEXT: movq 48(%rdi), %rax
-; FALLBACK8-NEXT: movq 56(%rdi), %rcx
-; FALLBACK8-NEXT: movl (%rsi), %edi
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: sarq $63, %rcx
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: leal (,%rdi,8), %eax
-; FALLBACK8-NEXT: andl $56, %eax
-; FALLBACK8-NEXT: andl $56, %edi
-; FALLBACK8-NEXT: movq -128(%rsp,%rdi), %r10
-; FALLBACK8-NEXT: movq -120(%rsp,%rdi), %r9
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: leaq (%r9,%r9), %r8
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r8
-; FALLBACK8-NEXT: orq %r10, %r8
-; FALLBACK8-NEXT: movq -104(%rsp,%rdi), %r10
-; FALLBACK8-NEXT: movq %r10, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %rbx
-; FALLBACK8-NEXT: movq -96(%rsp,%rdi), %r12
-; FALLBACK8-NEXT: leaq (%r12,%r12), %r11
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r11
-; FALLBACK8-NEXT: orq %rbx, %r11
-; FALLBACK8-NEXT: movq -112(%rsp,%rdi), %rbx
-; FALLBACK8-NEXT: movq %rbx, %r14
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r14
-; FALLBACK8-NEXT: addq %r10, %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: orq %r14, %r10
-; FALLBACK8-NEXT: movq -88(%rsp,%rdi), %r14
-; FALLBACK8-NEXT: movq %r14, %r13
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r13
-; FALLBACK8-NEXT: movq -80(%rsp,%rdi), %rbp
-; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r15
-; FALLBACK8-NEXT: orq %r13, %r15
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r12
-; FALLBACK8-NEXT: addq %r14, %r14
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r14
-; FALLBACK8-NEXT: orq %r12, %r14
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %rbp
-; FALLBACK8-NEXT: movq -72(%rsp,%rdi), %rdi
-; FALLBACK8-NEXT: leaq (%rdi,%rdi), %r12
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r12
-; FALLBACK8-NEXT: orq %rbp, %r12
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r9
-; FALLBACK8-NEXT: addq %rbx, %rbx
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rbx
-; FALLBACK8-NEXT: orq %r9, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: sarq %cl, %rdi
-; FALLBACK8-NEXT: movq %rdi, 56(%rdx)
-; FALLBACK8-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK8-NEXT: movq %r12, 48(%rdx)
-; FALLBACK8-NEXT: movq %r14, 32(%rdx)
-; FALLBACK8-NEXT: movq %r15, 40(%rdx)
-; FALLBACK8-NEXT: movq %r10, 16(%rdx)
-; FALLBACK8-NEXT: movq %r11, 24(%rdx)
-; FALLBACK8-NEXT: movq %r8, (%rdx)
-; FALLBACK8-NEXT: addq $8, %rsp
-; FALLBACK8-NEXT: popq %rbx
-; FALLBACK8-NEXT: popq %r12
-; FALLBACK8-NEXT: popq %r13
-; FALLBACK8-NEXT: popq %r14
-; FALLBACK8-NEXT: popq %r15
-; FALLBACK8-NEXT: popq %rbp
-; FALLBACK8-NEXT: vzeroupper
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: ashr_64bytes:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: pushq %r15
-; FALLBACK9-NEXT: pushq %r14
-; FALLBACK9-NEXT: pushq %rbx
-; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK9-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK9-NEXT: movq 48(%rdi), %rcx
-; FALLBACK9-NEXT: movq 56(%rdi), %rdi
-; FALLBACK9-NEXT: movl (%rsi), %eax
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: sarq $63, %rdi
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: leal (,%rax,8), %ecx
-; FALLBACK9-NEXT: andl $56, %ecx
-; FALLBACK9-NEXT: andl $56, %eax
-; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK9-NEXT: movq %r9, %rsi
-; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK9-NEXT: movq %r10, %r8
-; FALLBACK9-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK9-NEXT: movq %r11, %rbx
-; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK9-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK9-NEXT: movq %rax, %r15
-; FALLBACK9-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK9-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK9-NEXT: sarq %cl, %r11
-; FALLBACK9-NEXT: movq %r15, 8(%rdx)
-; FALLBACK9-NEXT: movq %r9, 48(%rdx)
-; FALLBACK9-NEXT: movq %r11, 56(%rdx)
-; FALLBACK9-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK9-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK9-NEXT: movq %r8, 16(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK9-NEXT: movq %r14, (%rdx)
-; FALLBACK9-NEXT: popq %rbx
-; FALLBACK9-NEXT: popq %r14
-; FALLBACK9-NEXT: popq %r15
-; FALLBACK9-NEXT: vzeroupper
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: ashr_64bytes:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: pushq %rbp
-; FALLBACK10-NEXT: pushq %r15
-; FALLBACK10-NEXT: pushq %r14
-; FALLBACK10-NEXT: pushq %r13
-; FALLBACK10-NEXT: pushq %r12
-; FALLBACK10-NEXT: pushq %rbx
-; FALLBACK10-NEXT: pushq %rax
-; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK10-NEXT: movq 48(%rdi), %rcx
-; FALLBACK10-NEXT: movq 56(%rdi), %rdi
-; FALLBACK10-NEXT: movl (%rsi), %eax
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: sarq $63, %rdi
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: leal (,%rax,8), %esi
-; FALLBACK10-NEXT: andl $56, %esi
-; FALLBACK10-NEXT: andl $56, %eax
-; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx
-; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi
-; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12
-; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13
-; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9
-; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10
-; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14
-; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15
-; FALLBACK10-NEXT: movl %esi, %ebx
-; FALLBACK10-NEXT: notb %bl
-; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp
-; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8
-; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8
-; FALLBACK10-NEXT: orq %r11, %r8
-; FALLBACK10-NEXT: leaq (%r13,%r13), %r11
-; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11
-; FALLBACK10-NEXT: orq %r12, %r11
-; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12
-; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13
-; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp
-; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK10-NEXT: sarxq %rsi, %rax, %rsi
-; FALLBACK10-NEXT: addq %rdi, %rdi
-; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi
-; FALLBACK10-NEXT: orq %r9, %rdi
-; FALLBACK10-NEXT: leaq (%r12,%r12), %r9
-; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9
-; FALLBACK10-NEXT: orq %r14, %r9
-; FALLBACK10-NEXT: addq %r10, %r10
-; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10
-; FALLBACK10-NEXT: orq %r15, %r10
-; FALLBACK10-NEXT: addq %rax, %rax
-; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax
-; FALLBACK10-NEXT: orq %r13, %rax
-; FALLBACK10-NEXT: addq %rcx, %rcx
-; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx
-; FALLBACK10-NEXT: orq %rbp, %rcx
-; FALLBACK10-NEXT: movq %rsi, 56(%rdx)
-; FALLBACK10-NEXT: movq %rcx, 8(%rdx)
-; FALLBACK10-NEXT: movq %rax, 48(%rdx)
-; FALLBACK10-NEXT: movq %r10, 32(%rdx)
-; FALLBACK10-NEXT: movq %r9, 40(%rdx)
-; FALLBACK10-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK10-NEXT: movq %r11, 24(%rdx)
-; FALLBACK10-NEXT: movq %r8, (%rdx)
-; FALLBACK10-NEXT: addq $8, %rsp
-; FALLBACK10-NEXT: popq %rbx
-; FALLBACK10-NEXT: popq %r12
-; FALLBACK10-NEXT: popq %r13
-; FALLBACK10-NEXT: popq %r14
-; FALLBACK10-NEXT: popq %r15
-; FALLBACK10-NEXT: popq %rbp
-; FALLBACK10-NEXT: vzeroupper
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: ashr_64bytes:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: pushq %r15
-; FALLBACK11-NEXT: pushq %r14
-; FALLBACK11-NEXT: pushq %rbx
-; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK11-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK11-NEXT: movq 48(%rdi), %rcx
-; FALLBACK11-NEXT: movq 56(%rdi), %rdi
-; FALLBACK11-NEXT: movl (%rsi), %eax
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: sarq $63, %rdi
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: leal (,%rax,8), %ecx
-; FALLBACK11-NEXT: andl $56, %ecx
-; FALLBACK11-NEXT: andl $56, %eax
-; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK11-NEXT: movq %r9, %rsi
-; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK11-NEXT: movq %r10, %r8
-; FALLBACK11-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK11-NEXT: movq %r11, %rbx
-; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK11-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK11-NEXT: movq %rax, %r15
-; FALLBACK11-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK11-NEXT: sarxq %rcx, %r11, %r10
-; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK11-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK11-NEXT: movq %r15, 8(%rdx)
-; FALLBACK11-NEXT: movq %r9, 48(%rdx)
-; FALLBACK11-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK11-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK11-NEXT: movq %r8, 16(%rdx)
-; FALLBACK11-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK11-NEXT: movq %r14, (%rdx)
-; FALLBACK11-NEXT: movq %r10, 56(%rdx)
-; FALLBACK11-NEXT: popq %rbx
-; FALLBACK11-NEXT: popq %r14
-; FALLBACK11-NEXT: popq %r15
-; FALLBACK11-NEXT: vzeroupper
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: ashr_64bytes:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: pushq %rbp
-; FALLBACK12-NEXT: pushq %r15
-; FALLBACK12-NEXT: pushq %r14
-; FALLBACK12-NEXT: pushq %r13
-; FALLBACK12-NEXT: pushq %r12
-; FALLBACK12-NEXT: pushq %rbx
-; FALLBACK12-NEXT: pushq %rax
-; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK12-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK12-NEXT: movq 48(%rdi), %rax
-; FALLBACK12-NEXT: movq 56(%rdi), %rcx
-; FALLBACK12-NEXT: movl (%rsi), %edi
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: sarq $63, %rcx
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: leal (,%rdi,8), %eax
-; FALLBACK12-NEXT: andl $56, %eax
-; FALLBACK12-NEXT: andl $56, %edi
-; FALLBACK12-NEXT: movq -128(%rsp,%rdi), %r10
-; FALLBACK12-NEXT: movq -120(%rsp,%rdi), %r9
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: leaq (%r9,%r9), %r8
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r8
-; FALLBACK12-NEXT: orq %r10, %r8
-; FALLBACK12-NEXT: movq -104(%rsp,%rdi), %r10
-; FALLBACK12-NEXT: movq %r10, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %rbx
-; FALLBACK12-NEXT: movq -96(%rsp,%rdi), %r12
-; FALLBACK12-NEXT: leaq (%r12,%r12), %r11
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r11
-; FALLBACK12-NEXT: orq %rbx, %r11
-; FALLBACK12-NEXT: movq -112(%rsp,%rdi), %rbx
-; FALLBACK12-NEXT: movq %rbx, %r14
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r14
-; FALLBACK12-NEXT: addq %r10, %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: orq %r14, %r10
-; FALLBACK12-NEXT: movq -88(%rsp,%rdi), %r14
-; FALLBACK12-NEXT: movq %r14, %r13
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r13
-; FALLBACK12-NEXT: movq -80(%rsp,%rdi), %rbp
-; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r15
-; FALLBACK12-NEXT: orq %r13, %r15
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r12
-; FALLBACK12-NEXT: addq %r14, %r14
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r14
-; FALLBACK12-NEXT: orq %r12, %r14
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %rbp
-; FALLBACK12-NEXT: movq -72(%rsp,%rdi), %rdi
-; FALLBACK12-NEXT: leaq (%rdi,%rdi), %r12
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r12
-; FALLBACK12-NEXT: orq %rbp, %r12
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r9
-; FALLBACK12-NEXT: addq %rbx, %rbx
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rbx
-; FALLBACK12-NEXT: orq %r9, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: sarq %cl, %rdi
-; FALLBACK12-NEXT: movq %rdi, 56(%rdx)
-; FALLBACK12-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK12-NEXT: movq %r12, 48(%rdx)
-; FALLBACK12-NEXT: movq %r14, 32(%rdx)
-; FALLBACK12-NEXT: movq %r15, 40(%rdx)
-; FALLBACK12-NEXT: movq %r10, 16(%rdx)
-; FALLBACK12-NEXT: movq %r11, 24(%rdx)
-; FALLBACK12-NEXT: movq %r8, (%rdx)
-; FALLBACK12-NEXT: addq $8, %rsp
-; FALLBACK12-NEXT: popq %rbx
-; FALLBACK12-NEXT: popq %r12
-; FALLBACK12-NEXT: popq %r13
-; FALLBACK12-NEXT: popq %r14
-; FALLBACK12-NEXT: popq %r15
-; FALLBACK12-NEXT: popq %rbp
-; FALLBACK12-NEXT: vzeroupper
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: ashr_64bytes:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: pushq %r15
-; FALLBACK13-NEXT: pushq %r14
-; FALLBACK13-NEXT: pushq %rbx
-; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK13-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK13-NEXT: movq 48(%rdi), %rcx
-; FALLBACK13-NEXT: movq 56(%rdi), %rdi
-; FALLBACK13-NEXT: movl (%rsi), %eax
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: sarq $63, %rdi
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: leal (,%rax,8), %ecx
-; FALLBACK13-NEXT: andl $56, %ecx
-; FALLBACK13-NEXT: andl $56, %eax
-; FALLBACK13-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK13-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK13-NEXT: movq %r9, %rsi
-; FALLBACK13-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK13-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK13-NEXT: movq %r10, %r8
-; FALLBACK13-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK13-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK13-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK13-NEXT: movq %r11, %rbx
-; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK13-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK13-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK13-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK13-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK13-NEXT: movq %rax, %r15
-; FALLBACK13-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK13-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK13-NEXT: sarq %cl, %r11
-; FALLBACK13-NEXT: movq %r15, 8(%rdx)
-; FALLBACK13-NEXT: movq %r9, 48(%rdx)
-; FALLBACK13-NEXT: movq %r11, 56(%rdx)
-; FALLBACK13-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK13-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK13-NEXT: movq %r8, 16(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK13-NEXT: movq %r14, (%rdx)
-; FALLBACK13-NEXT: popq %rbx
-; FALLBACK13-NEXT: popq %r14
-; FALLBACK13-NEXT: popq %r15
-; FALLBACK13-NEXT: vzeroupper
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: ashr_64bytes:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: pushq %rbp
-; FALLBACK14-NEXT: pushq %r15
-; FALLBACK14-NEXT: pushq %r14
-; FALLBACK14-NEXT: pushq %r13
-; FALLBACK14-NEXT: pushq %r12
-; FALLBACK14-NEXT: pushq %rbx
-; FALLBACK14-NEXT: pushq %rax
-; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK14-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK14-NEXT: movq 48(%rdi), %rcx
-; FALLBACK14-NEXT: movq 56(%rdi), %rdi
-; FALLBACK14-NEXT: movl (%rsi), %eax
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: sarq $63, %rdi
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: leal (,%rax,8), %esi
-; FALLBACK14-NEXT: andl $56, %esi
-; FALLBACK14-NEXT: andl $56, %eax
-; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK14-NEXT: movq -112(%rsp,%rax), %rcx
-; FALLBACK14-NEXT: movq -104(%rsp,%rax), %rdi
-; FALLBACK14-NEXT: shrxq %rsi, %rdi, %r12
-; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r13
-; FALLBACK14-NEXT: shrxq %rsi, %rcx, %r9
-; FALLBACK14-NEXT: movq -88(%rsp,%rax), %r10
-; FALLBACK14-NEXT: shrxq %rsi, %r10, %r14
-; FALLBACK14-NEXT: shrxq %rsi, %r13, %r15
-; FALLBACK14-NEXT: movl %esi, %ebx
-; FALLBACK14-NEXT: notb %bl
-; FALLBACK14-NEXT: movq -120(%rsp,%rax), %rbp
-; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8
-; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8
-; FALLBACK14-NEXT: orq %r11, %r8
-; FALLBACK14-NEXT: leaq (%r13,%r13), %r11
-; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11
-; FALLBACK14-NEXT: orq %r12, %r11
-; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12
-; FALLBACK14-NEXT: shrxq %rsi, %r12, %r13
-; FALLBACK14-NEXT: shrxq %rsi, %rbp, %rbp
-; FALLBACK14-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK14-NEXT: sarxq %rsi, %rax, %rsi
-; FALLBACK14-NEXT: addq %rdi, %rdi
-; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi
-; FALLBACK14-NEXT: orq %r9, %rdi
-; FALLBACK14-NEXT: leaq (%r12,%r12), %r9
-; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9
-; FALLBACK14-NEXT: orq %r14, %r9
-; FALLBACK14-NEXT: addq %r10, %r10
-; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10
-; FALLBACK14-NEXT: orq %r15, %r10
-; FALLBACK14-NEXT: addq %rax, %rax
-; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax
-; FALLBACK14-NEXT: orq %r13, %rax
-; FALLBACK14-NEXT: addq %rcx, %rcx
-; FALLBACK14-NEXT: shlxq %rbx, %rcx, %rcx
-; FALLBACK14-NEXT: orq %rbp, %rcx
-; FALLBACK14-NEXT: movq %rsi, 56(%rdx)
-; FALLBACK14-NEXT: movq %rcx, 8(%rdx)
-; FALLBACK14-NEXT: movq %rax, 48(%rdx)
-; FALLBACK14-NEXT: movq %r10, 32(%rdx)
-; FALLBACK14-NEXT: movq %r9, 40(%rdx)
-; FALLBACK14-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK14-NEXT: movq %r11, 24(%rdx)
-; FALLBACK14-NEXT: movq %r8, (%rdx)
-; FALLBACK14-NEXT: addq $8, %rsp
-; FALLBACK14-NEXT: popq %rbx
-; FALLBACK14-NEXT: popq %r12
-; FALLBACK14-NEXT: popq %r13
-; FALLBACK14-NEXT: popq %r14
-; FALLBACK14-NEXT: popq %r15
-; FALLBACK14-NEXT: popq %rbp
-; FALLBACK14-NEXT: vzeroupper
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: ashr_64bytes:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: pushq %r15
-; FALLBACK15-NEXT: pushq %r14
-; FALLBACK15-NEXT: pushq %rbx
-; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK15-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK15-NEXT: movq 48(%rdi), %rcx
-; FALLBACK15-NEXT: movq 56(%rdi), %rdi
-; FALLBACK15-NEXT: movl (%rsi), %eax
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: sarq $63, %rdi
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: leal (,%rax,8), %ecx
-; FALLBACK15-NEXT: andl $56, %ecx
-; FALLBACK15-NEXT: andl $56, %eax
-; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK15-NEXT: movq %r9, %rsi
-; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK15-NEXT: movq %r10, %r8
-; FALLBACK15-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK15-NEXT: movq %r11, %rbx
-; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK15-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK15-NEXT: movq %rax, %r15
-; FALLBACK15-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK15-NEXT: sarxq %rcx, %r11, %r10
-; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK15-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK15-NEXT: movq %r15, 8(%rdx)
-; FALLBACK15-NEXT: movq %r9, 48(%rdx)
-; FALLBACK15-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK15-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK15-NEXT: movq %r8, 16(%rdx)
-; FALLBACK15-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK15-NEXT: movq %r14, (%rdx)
-; FALLBACK15-NEXT: movq %r10, 56(%rdx)
-; FALLBACK15-NEXT: popq %rbx
-; FALLBACK15-NEXT: popq %r14
-; FALLBACK15-NEXT: popq %r15
-; FALLBACK15-NEXT: vzeroupper
-; FALLBACK15-NEXT: retq
-;
-; FALLBACK16-LABEL: ashr_64bytes:
-; FALLBACK16: # %bb.0:
-; FALLBACK16-NEXT: pushl %ebp
-; FALLBACK16-NEXT: pushl %ebx
-; FALLBACK16-NEXT: pushl %edi
-; FALLBACK16-NEXT: pushl %esi
-; FALLBACK16-NEXT: subl $204, %esp
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT: movl (%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 4(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 8(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 12(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 16(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 20(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 24(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 28(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 32(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 36(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 40(%ecx), %ebx
-; FALLBACK16-NEXT: movl 44(%ecx), %edi
-; FALLBACK16-NEXT: movl 48(%ecx), %esi
-; FALLBACK16-NEXT: movl 52(%ecx), %edx
-; FALLBACK16-NEXT: movl 56(%ecx), %eax
-; FALLBACK16-NEXT: movl 60(%ecx), %ecx
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK16-NEXT: movl (%ebp), %ebp
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: sarl $31, %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebp, %ecx
-; FALLBACK16-NEXT: movl %ebp, %esi
-; FALLBACK16-NEXT: andl $60, %esi
-; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx
-; FALLBACK16-NEXT: shll $3, %ecx
-; FALLBACK16-NEXT: andl $24, %ecx
-; FALLBACK16-NEXT: movl %edx, %eax
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl 72(%esp,%esi), %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: addl %edi, %edi
-; FALLBACK16-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK16-NEXT: movl %ecx, %ebx
-; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK16-NEXT: notb %ch
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: orl %eax, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 64(%esp,%esi), %eax
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: addl %edx, %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: orl %eax, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 76(%esp,%esi), %ebp
-; FALLBACK16-NEXT: movl %ebp, %edx
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi
-; FALLBACK16-NEXT: leal (%edi,%edi), %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: orl %edx, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: addl %ebp, %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %eax, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %esi, %edx
-; FALLBACK16-NEXT: movl 84(%esp,%esi), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl 88(%esp,%esi), %esi
-; FALLBACK16-NEXT: leal (%esi,%esi), %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %eax, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT: addl %ebx, %ebx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: orl %edi, %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %edx, %eax
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 92(%esp,%edx), %ebp
-; FALLBACK16-NEXT: movl %ebp, %edx
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: movl 96(%esp,%eax), %edi
-; FALLBACK16-NEXT: leal (%edi,%edi), %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: orl %edx, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: addl %ebp, %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %esi, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: movl 100(%esp,%edx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl 104(%esp,%edx), %esi
-; FALLBACK16-NEXT: leal (%esi,%esi), %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %eax, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %ebx, %edx
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT: addl %ebx, %ebx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: orl %edi, %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK16-NEXT: movl 108(%esp,%ebp), %edi
-; FALLBACK16-NEXT: movl %edi, %eax
-; FALLBACK16-NEXT: movl %edx, %ebx
-; FALLBACK16-NEXT: movl %ebx, %ecx
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl 112(%esp,%ebp), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %ebp, %edx
-; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp
-; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %eax, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: addl %edi, %edi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: orl %esi, %edi
-; FALLBACK16-NEXT: movl 116(%esp,%edx), %esi
-; FALLBACK16-NEXT: movl %esi, %eax
-; FALLBACK16-NEXT: movl %ebx, %ecx
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl 120(%esp,%edx), %edx
-; FALLBACK16-NEXT: leal (%edx,%edx), %ebp
-; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %eax, %ebp
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: addl %esi, %esi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: orl %eax, %esi
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: movl %edx, %eax
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx
-; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: orl %eax, %edx
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK16-NEXT: sarl %cl, %ebx
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl %ebx, 60(%eax)
-; FALLBACK16-NEXT: movl %edx, 56(%eax)
-; FALLBACK16-NEXT: movl %esi, 48(%eax)
-; FALLBACK16-NEXT: movl %ebp, 52(%eax)
-; FALLBACK16-NEXT: movl %edi, 40(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 44(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 32(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 36(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 24(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 28(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 16(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 20(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 8(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 12(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, (%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 4(%eax)
-; FALLBACK16-NEXT: addl $204, %esp
-; FALLBACK16-NEXT: popl %esi
-; FALLBACK16-NEXT: popl %edi
-; FALLBACK16-NEXT: popl %ebx
-; FALLBACK16-NEXT: popl %ebp
-; FALLBACK16-NEXT: retl
-;
-; FALLBACK17-LABEL: ashr_64bytes:
-; FALLBACK17: # %bb.0:
-; FALLBACK17-NEXT: pushl %ebp
-; FALLBACK17-NEXT: pushl %ebx
-; FALLBACK17-NEXT: pushl %edi
-; FALLBACK17-NEXT: pushl %esi
-; FALLBACK17-NEXT: subl $188, %esp
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK17-NEXT: movl (%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 4(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 8(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 12(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 16(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 20(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 24(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 28(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 32(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 36(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: movl 40(%eax), %ebp
-; FALLBACK17-NEXT: movl 44(%eax), %ebx
-; FALLBACK17-NEXT: movl 48(%eax), %edi
-; FALLBACK17-NEXT: movl 52(%eax), %esi
-; FALLBACK17-NEXT: movl 56(%eax), %edx
-; FALLBACK17-NEXT: movl 60(%eax), %eax
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl (%ecx), %ecx
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: sarl $31, %eax
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ecx, %ebp
-; FALLBACK17-NEXT: andl $60, %ebp
-; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shll $3, %ecx
-; FALLBACK17-NEXT: andl $24, %ecx
-; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %esi
-; FALLBACK17-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edx
-; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edx
-; FALLBACK17-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi
-; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edx
-; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl %esi, %edx
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edi
-; FALLBACK17-NEXT: shrdl %cl, %esi, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx
-; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edi
-; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx
-; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK17-NEXT: movl %edx, 56(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK17-NEXT: sarl %cl, %eax
-; FALLBACK17-NEXT: movl %eax, 60(%ebp)
-; FALLBACK17-NEXT: movl %esi, 48(%ebp)
-; FALLBACK17-NEXT: movl %edi, 52(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 40(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 44(%ebp)
-; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 32(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 36(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 24(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 28(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 16(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 20(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 8(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 12(%ebp)
-; FALLBACK17-NEXT: movl %ebx, (%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 4(%ebp)
-; FALLBACK17-NEXT: addl $188, %esp
-; FALLBACK17-NEXT: popl %esi
-; FALLBACK17-NEXT: popl %edi
-; FALLBACK17-NEXT: popl %ebx
-; FALLBACK17-NEXT: popl %ebp
-; FALLBACK17-NEXT: retl
-;
-; FALLBACK18-LABEL: ashr_64bytes:
-; FALLBACK18: # %bb.0:
-; FALLBACK18-NEXT: pushl %ebp
-; FALLBACK18-NEXT: pushl %ebx
-; FALLBACK18-NEXT: pushl %edi
-; FALLBACK18-NEXT: pushl %esi
-; FALLBACK18-NEXT: subl $204, %esp
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl (%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 4(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 8(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 12(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 16(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 20(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 24(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 28(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 32(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 36(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 40(%eax), %ebp
-; FALLBACK18-NEXT: movl 44(%eax), %ebx
-; FALLBACK18-NEXT: movl 48(%eax), %edi
-; FALLBACK18-NEXT: movl 52(%eax), %esi
-; FALLBACK18-NEXT: movl 56(%eax), %edx
-; FALLBACK18-NEXT: movl 60(%eax), %ecx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl (%eax), %eax
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: sarl $31, %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %eax, %ecx
-; FALLBACK18-NEXT: leal (,%eax,8), %edx
-; FALLBACK18-NEXT: andl $24, %edx
-; FALLBACK18-NEXT: andl $60, %ecx
-; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi
-; FALLBACK18-NEXT: movl 72(%esp,%ecx), %edi
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl %edx, %ebx
-; FALLBACK18-NEXT: notb %bl
-; FALLBACK18-NEXT: leal (%edi,%edi), %ebp
-; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax
-; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK18-NEXT: orl %edi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi
-; FALLBACK18-NEXT: leal (%esi,%esi), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK18-NEXT: orl %eax, %edi
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%eax,%eax), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: orl %esi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi
-; FALLBACK18-NEXT: leal (%esi,%esi), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK18-NEXT: orl %eax, %edi
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%eax,%eax), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: orl %esi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%eax,%eax), %esi
-; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi
-; FALLBACK18-NEXT: movl %ecx, %edi
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK18-NEXT: orl %ecx, %esi
-; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp
-; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
-; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax
-; FALLBACK18-NEXT: shrxl %edx, %eax, %edi
-; FALLBACK18-NEXT: orl %edi, %ecx
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: addl %eax, %eax
-; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi
-; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp
-; FALLBACK18-NEXT: sarxl %edx, %ebp, %edx
-; FALLBACK18-NEXT: addl %ebp, %ebp
-; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx
-; FALLBACK18-NEXT: orl %eax, %ebx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl %edx, 60(%eax)
-; FALLBACK18-NEXT: movl %ebx, 56(%eax)
-; FALLBACK18-NEXT: movl %edi, 48(%eax)
-; FALLBACK18-NEXT: movl %ecx, 52(%eax)
-; FALLBACK18-NEXT: movl %esi, 40(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 44(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 32(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 36(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 24(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 28(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 16(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 20(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 8(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 12(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, (%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 4(%eax)
-; FALLBACK18-NEXT: addl $204, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: ashr_64bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $188, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK19-NEXT: movl (%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 4(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 8(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 12(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 16(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 20(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 24(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 28(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 32(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 36(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 40(%eax), %ebp
-; FALLBACK19-NEXT: movl 44(%eax), %ebx
-; FALLBACK19-NEXT: movl 48(%eax), %edi
-; FALLBACK19-NEXT: movl 52(%eax), %esi
-; FALLBACK19-NEXT: movl 56(%eax), %edx
-; FALLBACK19-NEXT: movl 60(%eax), %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl (%ecx), %ecx
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: sarl $31, %eax
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ecx, %ebp
-; FALLBACK19-NEXT: andl $60, %ebp
-; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shll $3, %ecx
-; FALLBACK19-NEXT: andl $24, %ecx
-; FALLBACK19-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %esi
-; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl %edi, %edx
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl %eax, 56(%ebp)
-; FALLBACK19-NEXT: movl %esi, 48(%ebp)
-; FALLBACK19-NEXT: movl %edx, 52(%ebp)
-; FALLBACK19-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 44(%ebp)
-; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 32(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 36(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 24(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 28(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 16(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 20(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 8(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 12(%ebp)
-; FALLBACK19-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK19-NEXT: movl %edi, (%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK19-NEXT: movl %eax, 60(%ebp)
-; FALLBACK19-NEXT: addl $188, %esp
-; FALLBACK19-NEXT: popl %esi
-; FALLBACK19-NEXT: popl %edi
-; FALLBACK19-NEXT: popl %ebx
-; FALLBACK19-NEXT: popl %ebp
-; FALLBACK19-NEXT: retl
-;
-; FALLBACK20-LABEL: ashr_64bytes:
-; FALLBACK20: # %bb.0:
-; FALLBACK20-NEXT: pushl %ebp
-; FALLBACK20-NEXT: pushl %ebx
-; FALLBACK20-NEXT: pushl %edi
-; FALLBACK20-NEXT: pushl %esi
-; FALLBACK20-NEXT: subl $204, %esp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movups (%ecx), %xmm0
-; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK20-NEXT: movl 48(%ecx), %edx
-; FALLBACK20-NEXT: movl 52(%ecx), %esi
-; FALLBACK20-NEXT: movl 56(%ecx), %edi
-; FALLBACK20-NEXT: movl 60(%ecx), %ecx
-; FALLBACK20-NEXT: movl (%eax), %eax
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: sarl $31, %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %eax, %esi
-; FALLBACK20-NEXT: andl $60, %esi
-; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx
-; FALLBACK20-NEXT: shll $3, %eax
-; FALLBACK20-NEXT: andl $24, %eax
-; FALLBACK20-NEXT: movl %edx, %edi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT: movb %al, %ch
-; FALLBACK20-NEXT: notb %ch
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %edi, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: addl %edx, %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: orl %edi, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx
-; FALLBACK20-NEXT: movl %edx, %ebp
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi
-; FALLBACK20-NEXT: leal (%edi,%edi), %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %ebp, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: addl %edx, %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: orl %ebx, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %ebp
-; FALLBACK20-NEXT: movl %eax, %edx
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: addl %eax, %eax
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %eax
-; FALLBACK20-NEXT: orl %ebp, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) 
# 1-byte Spill -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %eax -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %eax, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK20-NEXT: leal (%edx,%edx), %eax -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK20-NEXT: movl %edi, %ebp -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx -; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: addl %edi, %edi -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl %edx, %edi -; FALLBACK20-NEXT: movl %esi, %edx -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK20-NEXT: movl %esi, %ebx -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK20-NEXT: leal (%eax,%eax), %ebp -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %ebx, %ebp -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK20-NEXT: leal 
(%ebx,%ebx), %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %eax, %edx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK20-NEXT: sarl %cl, %ebx -; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK20-NEXT: movl %ebx, 60(%eax) -; FALLBACK20-NEXT: movl %edx, 56(%eax) -; FALLBACK20-NEXT: movl %esi, 48(%eax) -; FALLBACK20-NEXT: movl %ebp, 52(%eax) -; FALLBACK20-NEXT: movl %edi, 40(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 44(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 32(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 36(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 24(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 28(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 16(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 20(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 8(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 12(%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, (%eax) -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: movl %ecx, 4(%eax) -; FALLBACK20-NEXT: addl $204, %esp -; FALLBACK20-NEXT: popl %esi -; FALLBACK20-NEXT: popl %edi -; FALLBACK20-NEXT: popl %ebx -; FALLBACK20-NEXT: popl %ebp -; FALLBACK20-NEXT: retl -; -; FALLBACK21-LABEL: ashr_64bytes: -; FALLBACK21: # %bb.0: -; FALLBACK21-NEXT: pushl %ebp -; FALLBACK21-NEXT: pushl %ebx -; FALLBACK21-NEXT: pushl %edi -; FALLBACK21-NEXT: pushl %esi -; FALLBACK21-NEXT: subl $188, %esp -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK21-NEXT: movups (%eax), %xmm0 -; FALLBACK21-NEXT: movups 16(%eax), %xmm1 -; FALLBACK21-NEXT: movups 32(%eax), %xmm2 -; FALLBACK21-NEXT: movl 48(%eax), %edx -; FALLBACK21-NEXT: movl 52(%eax), %esi -; FALLBACK21-NEXT: movl 56(%eax), %edi -; FALLBACK21-NEXT: movl 60(%eax), %eax -; FALLBACK21-NEXT: movl (%ecx), %ecx -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: sarl $31, %eax -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; 
FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %ecx, %ebp -; FALLBACK21-NEXT: andl $60, %ebp -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shll $3, %ecx -; FALLBACK21-NEXT: andl $24, %ecx -; FALLBACK21-NEXT: shrdl %cl, %edx, %eax -; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %esi -; FALLBACK21-NEXT: shrdl %cl, %edi, %esi -; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %esi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %edi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %esi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl %esi, %edx -; FALLBACK21-NEXT: shrdl %cl, %eax, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edi -; FALLBACK21-NEXT: shrdl %cl, %esi, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edi -; FALLBACK21-NEXT: shrdl %cl, %edx, %edi -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %edx, 56(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK21-NEXT: sarl %cl, %eax -; FALLBACK21-NEXT: movl %eax, 60(%ebp) -; FALLBACK21-NEXT: movl %esi, 48(%ebp) -; FALLBACK21-NEXT: movl %edi, 52(%ebp) -; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 40(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 44(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 32(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: 
movl %eax, 36(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 24(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 28(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %ebx, (%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 4(%ebp) -; FALLBACK21-NEXT: addl $188, %esp -; FALLBACK21-NEXT: popl %esi -; FALLBACK21-NEXT: popl %edi -; FALLBACK21-NEXT: popl %ebx -; FALLBACK21-NEXT: popl %ebp -; FALLBACK21-NEXT: retl -; -; FALLBACK22-LABEL: ashr_64bytes: -; FALLBACK22: # %bb.0: -; FALLBACK22-NEXT: pushl %ebp -; FALLBACK22-NEXT: pushl %ebx -; FALLBACK22-NEXT: pushl %edi -; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $204, %esp -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 -; FALLBACK22-NEXT: movl 48(%ecx), %edx -; FALLBACK22-NEXT: movl 52(%ecx), %esi -; FALLBACK22-NEXT: movl 56(%ecx), %edi -; FALLBACK22-NEXT: movl 60(%ecx), %ecx -; FALLBACK22-NEXT: movl (%eax), %eax -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: sarl $31, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %eax, %ecx -; FALLBACK22-NEXT: leal (,%eax,8), %edx -; FALLBACK22-NEXT: andl $24, %edx -; FALLBACK22-NEXT: andl $60, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl 72(%esp,%ecx), %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %edx, %ebx -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: leal (%edi,%edi), %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %eax -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: 
shrxl %edx, 64(%esp,%ecx), %edi -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK22-NEXT: orl %edi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi -; FALLBACK22-NEXT: orl %eax, %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi -; FALLBACK22-NEXT: orl %eax, %edi -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 112(%esp,%ecx), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%eax,%eax), %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK22-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl %ecx, %edi -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: movl 116(%esp,%edi), %eax -; 
FALLBACK22-NEXT: shrxl %edx, %eax, %edi -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK22-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK22-NEXT: addl %ebp, %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK22-NEXT: orl %eax, %ebx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl %edx, 60(%eax) -; FALLBACK22-NEXT: movl %ebx, 56(%eax) -; FALLBACK22-NEXT: movl %edi, 48(%eax) -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl %esi, 40(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 44(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 32(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 36(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 24(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 28(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 16(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 20(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 8(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 12(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, (%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 4(%eax) -; FALLBACK22-NEXT: addl $204, %esp -; FALLBACK22-NEXT: popl %esi -; FALLBACK22-NEXT: popl %edi -; FALLBACK22-NEXT: popl %ebx -; FALLBACK22-NEXT: popl %ebp -; FALLBACK22-NEXT: retl -; -; FALLBACK23-LABEL: ashr_64bytes: -; FALLBACK23: # %bb.0: -; FALLBACK23-NEXT: pushl %ebp -; FALLBACK23-NEXT: pushl %ebx -; FALLBACK23-NEXT: pushl %edi -; FALLBACK23-NEXT: pushl %esi -; FALLBACK23-NEXT: subl $188, %esp -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: movups (%eax), %xmm0 -; FALLBACK23-NEXT: movups 16(%eax), %xmm1 -; FALLBACK23-NEXT: movups 32(%eax), %xmm2 -; FALLBACK23-NEXT: movl 48(%eax), %edx -; FALLBACK23-NEXT: movl 52(%eax), %esi -; FALLBACK23-NEXT: movl 56(%eax), %edi -; FALLBACK23-NEXT: movl 60(%eax), %eax -; FALLBACK23-NEXT: movl (%ecx), %ecx -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: sarl $31, %eax -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: 
movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: movl %ecx, %ebp -; FALLBACK23-NEXT: andl $60, %ebp -; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shll $3, %ecx -; FALLBACK23-NEXT: andl $24, %ecx -; FALLBACK23-NEXT: shrdl %cl, %edx, %eax -; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %esi -; FALLBACK23-NEXT: shrdl %cl, %edi, %esi -; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edi -; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %edi, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %esi -; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx -; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %edi -; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %esi, %edx -; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx -; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax -; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl %edi, %edx -; FALLBACK23-NEXT: shrdl %cl, %eax, %edx -; FALLBACK23-NEXT: shrdl %cl, %edi, %esi -; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi -; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp -; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill -; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax -; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK23-NEXT: movl %eax, 56(%ebp) -; FALLBACK23-NEXT: movl %esi, 48(%ebp) -; FALLBACK23-NEXT: movl %edx, 52(%ebp) -; FALLBACK23-NEXT: movl %ebx, 40(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 44(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 32(%ebp) -; 
FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 36(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 24(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 28(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 16(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 20(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 8(%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK23-NEXT: movl %eax, 12(%ebp) -; FALLBACK23-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload -; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK23-NEXT: shrdl %cl, %edx, %edi -; FALLBACK23-NEXT: movl %edi, (%ebp) -; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK23-NEXT: movl %ecx, 4(%ebp) -; FALLBACK23-NEXT: movl %eax, 60(%ebp) -; FALLBACK23-NEXT: addl $188, %esp -; FALLBACK23-NEXT: popl %esi -; FALLBACK23-NEXT: popl %edi -; FALLBACK23-NEXT: popl %ebx -; FALLBACK23-NEXT: popl %ebp -; FALLBACK23-NEXT: retl -; -; FALLBACK24-LABEL: ashr_64bytes: -; FALLBACK24: # %bb.0: -; FALLBACK24-NEXT: pushl %ebp -; FALLBACK24-NEXT: pushl %ebx -; FALLBACK24-NEXT: pushl %edi -; FALLBACK24-NEXT: pushl %esi -; FALLBACK24-NEXT: subl $204, %esp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK24-NEXT: vmovups 32(%ecx), %xmm1 -; FALLBACK24-NEXT: movl 48(%ecx), %edx -; FALLBACK24-NEXT: movl 52(%ecx), %esi -; FALLBACK24-NEXT: movl 56(%ecx), %edi -; FALLBACK24-NEXT: movl 60(%ecx), %ecx -; FALLBACK24-NEXT: movl (%eax), %eax -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: sarl $31, %ecx -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %eax, %esi -; FALLBACK24-NEXT: andl $60, %esi -; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK24-NEXT: shll $3, %eax -; FALLBACK24-NEXT: andl $24, %eax -; FALLBACK24-NEXT: movl %edx, %edi -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: movl 72(%esp,%esi), %ecx -; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx -; 
FALLBACK24-NEXT: movb %al, %ch -; FALLBACK24-NEXT: notb %ch -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %edx, %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %edi, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK24-NEXT: movl %edx, %ebp -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %edx, %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: addl %eax, %eax -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %eax -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %eax, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK24-NEXT: leal (%edx,%edx), %eax -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movb %al, %cl -; 
FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK24-NEXT: movl %edi, %ebp -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx -; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: addl %edi, %edi -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl %edx, %edi -; FALLBACK24-NEXT: movl %esi, %edx -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK24-NEXT: movl %esi, %ebx -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK24-NEXT: leal (%eax,%eax), %ebp -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %ebx, %ebp -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %eax, %edx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK24-NEXT: sarl %cl, %ebx -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK24-NEXT: movl %ebx, 60(%eax) -; FALLBACK24-NEXT: movl %edx, 56(%eax) -; FALLBACK24-NEXT: movl %esi, 48(%eax) -; FALLBACK24-NEXT: movl %ebp, 52(%eax) -; FALLBACK24-NEXT: movl %edi, 40(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 44(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 32(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 36(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 24(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 28(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 16(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 20(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 8(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl 
%ecx, 12(%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, (%eax) -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: movl %ecx, 4(%eax) -; FALLBACK24-NEXT: addl $204, %esp -; FALLBACK24-NEXT: popl %esi -; FALLBACK24-NEXT: popl %edi -; FALLBACK24-NEXT: popl %ebx -; FALLBACK24-NEXT: popl %ebp -; FALLBACK24-NEXT: vzeroupper -; FALLBACK24-NEXT: retl -; -; FALLBACK25-LABEL: ashr_64bytes: -; FALLBACK25: # %bb.0: -; FALLBACK25-NEXT: pushl %ebp -; FALLBACK25-NEXT: pushl %ebx -; FALLBACK25-NEXT: pushl %edi -; FALLBACK25-NEXT: pushl %esi -; FALLBACK25-NEXT: subl $188, %esp -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: vmovups (%eax), %ymm0 -; FALLBACK25-NEXT: vmovups 32(%eax), %xmm1 -; FALLBACK25-NEXT: movl 48(%eax), %edx -; FALLBACK25-NEXT: movl 52(%eax), %esi -; FALLBACK25-NEXT: movl 56(%eax), %edi -; FALLBACK25-NEXT: movl 60(%eax), %eax -; FALLBACK25-NEXT: movl (%ecx), %ecx -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: sarl $31, %eax -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %ecx, %ebp -; FALLBACK25-NEXT: andl $60, %ebp -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shll $3, %ecx -; FALLBACK25-NEXT: andl $24, %ecx -; FALLBACK25-NEXT: shrdl %cl, %edx, %eax -; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %esi -; FALLBACK25-NEXT: shrdl %cl, %edi, %esi -; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %edi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; 
FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl %esi, %edx -; FALLBACK25-NEXT: shrdl %cl, %eax, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi -; FALLBACK25-NEXT: shrdl %cl, %esi, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi -; FALLBACK25-NEXT: shrdl %cl, %edx, %edi -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %edx, 56(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK25-NEXT: sarl %cl, %eax -; FALLBACK25-NEXT: movl %eax, 60(%ebp) -; FALLBACK25-NEXT: movl %esi, 48(%ebp) -; FALLBACK25-NEXT: movl %edi, 52(%ebp) -; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 40(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 44(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 32(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 36(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 24(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 28(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %ebx, (%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 4(%ebp) -; FALLBACK25-NEXT: addl $188, %esp -; FALLBACK25-NEXT: popl %esi -; FALLBACK25-NEXT: popl %edi -; FALLBACK25-NEXT: popl %ebx -; FALLBACK25-NEXT: popl %ebp -; FALLBACK25-NEXT: vzeroupper -; FALLBACK25-NEXT: retl -; -; FALLBACK26-LABEL: ashr_64bytes: -; FALLBACK26: # %bb.0: -; FALLBACK26-NEXT: pushl %ebp -; FALLBACK26-NEXT: pushl %ebx -; FALLBACK26-NEXT: pushl %edi -; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $204, %esp -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: vmovups 32(%ecx), %xmm1 -; FALLBACK26-NEXT: movl 48(%ecx), %edx -; FALLBACK26-NEXT: movl 52(%ecx), %esi -; FALLBACK26-NEXT: movl 56(%ecx), %edi -; 
FALLBACK26-NEXT: movl 60(%ecx), %ecx -; FALLBACK26-NEXT: movl (%eax), %eax -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: sarl $31, %ecx -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: leal (,%eax,8), %edx -; FALLBACK26-NEXT: andl $24, %edx -; FALLBACK26-NEXT: andl $60, %ecx -; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl 72(%esp,%ecx), %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: leal (%edi,%edi), %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %eax -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK26-NEXT: orl %edi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi -; FALLBACK26-NEXT: orl %eax, %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl 
%edx, %edi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi -; FALLBACK26-NEXT: orl %eax, %edi -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl %ecx, %edi -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK26-NEXT: shrxl %edx, %eax, %edi -; FALLBACK26-NEXT: orl %edi, %ecx -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK26-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK26-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK26-NEXT: addl %ebp, %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK26-NEXT: orl %eax, %ebx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl %edx, 60(%eax) -; FALLBACK26-NEXT: movl %ebx, 56(%eax) -; FALLBACK26-NEXT: movl %edi, 48(%eax) -; FALLBACK26-NEXT: movl %ecx, 52(%eax) -; FALLBACK26-NEXT: movl %esi, 40(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 44(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 32(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 36(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 24(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 28(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 16(%eax) -; FALLBACK26-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 20(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 8(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 12(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, (%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 4(%eax)
-; FALLBACK26-NEXT: addl $204, %esp
-; FALLBACK26-NEXT: popl %esi
-; FALLBACK26-NEXT: popl %edi
-; FALLBACK26-NEXT: popl %ebx
-; FALLBACK26-NEXT: popl %ebp
-; FALLBACK26-NEXT: vzeroupper
-; FALLBACK26-NEXT: retl
-;
-; FALLBACK27-LABEL: ashr_64bytes:
-; FALLBACK27: # %bb.0:
-; FALLBACK27-NEXT: pushl %ebp
-; FALLBACK27-NEXT: pushl %ebx
-; FALLBACK27-NEXT: pushl %edi
-; FALLBACK27-NEXT: pushl %esi
-; FALLBACK27-NEXT: subl $188, %esp
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: vmovups (%eax), %ymm0
-; FALLBACK27-NEXT: vmovups 32(%eax), %xmm1
-; FALLBACK27-NEXT: movl 48(%eax), %edx
-; FALLBACK27-NEXT: movl 52(%eax), %esi
-; FALLBACK27-NEXT: movl 56(%eax), %edi
-; FALLBACK27-NEXT: movl 60(%eax), %eax
-; FALLBACK27-NEXT: movl (%ecx), %ecx
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: sarl $31, %eax
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %ecx, %ebp
-; FALLBACK27-NEXT: andl $60, %ebp
-; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shll $3, %ecx
-; FALLBACK27-NEXT: andl $24, %ecx
-; FALLBACK27-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %esi
-; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl %edi, %edx
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK27-NEXT: movl %eax, 56(%ebp)
-; FALLBACK27-NEXT: movl %esi, 48(%ebp)
-; FALLBACK27-NEXT: movl %edx, 52(%ebp)
-; FALLBACK27-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 44(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 32(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 36(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 24(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 28(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 16(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 20(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 8(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 12(%ebp)
-; FALLBACK27-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK27-NEXT: movl %edi, (%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK27-NEXT: movl %eax, 60(%ebp)
-; FALLBACK27-NEXT: addl $188, %esp
-; FALLBACK27-NEXT: popl %esi
-; FALLBACK27-NEXT: popl %edi
-; FALLBACK27-NEXT: popl %ebx
-; FALLBACK27-NEXT: popl %ebp
-; FALLBACK27-NEXT: vzeroupper
-; FALLBACK27-NEXT: retl
-;
-; FALLBACK28-LABEL: ashr_64bytes:
-; FALLBACK28: # %bb.0:
-; FALLBACK28-NEXT: pushl %ebp
-; FALLBACK28-NEXT: pushl %ebx
-; FALLBACK28-NEXT: pushl %edi
-; FALLBACK28-NEXT: pushl %esi
-; FALLBACK28-NEXT: subl $204, %esp
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK28-NEXT: vmovups 32(%ecx), %xmm1
-; FALLBACK28-NEXT: movl 48(%ecx), %edx
-; FALLBACK28-NEXT: movl 52(%ecx), %esi
-; FALLBACK28-NEXT: movl 56(%ecx), %edi
-; FALLBACK28-NEXT: movl 60(%ecx), %ecx
-; FALLBACK28-NEXT: movl (%eax), %eax
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: sarl $31, %ecx
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %eax, %esi
-; FALLBACK28-NEXT: andl $60, %esi
-; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx
-; FALLBACK28-NEXT: shll $3, %eax
-; FALLBACK28-NEXT: andl $24, %eax
-; FALLBACK28-NEXT: movl %edx, %edi
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: movl 72(%esp,%esi), %ecx
-; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK28-NEXT: movb %al, %ch
-; FALLBACK28-NEXT: notb %ch
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %edi, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: addl %edx, %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %edx
-; FALLBACK28-NEXT: orl %edi, %edx
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx
-; FALLBACK28-NEXT: movl %edx, %ebp
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi
-; FALLBACK28-NEXT: leal (%edi,%edi), %ebx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %ebp, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: addl %edx, %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %edx
-; FALLBACK28-NEXT: orl %ebx, %edx
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %ebp
-; FALLBACK28-NEXT: movl %eax, %edx
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: addl %eax, %eax
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %eax
-; FALLBACK28-NEXT: orl %ebp, %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: addl %ebx, %ebx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %edi, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %ebp
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi
-; FALLBACK28-NEXT: leal (%edi,%edi), %eax
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %eax
-; FALLBACK28-NEXT: orl %ebp, %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: addl %ebx, %ebx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %eax, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %ebp
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx
-; FALLBACK28-NEXT: leal (%edx,%edx), %eax
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %eax
-; FALLBACK28-NEXT: orl %ebp, %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: addl %ebx, %ebx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %edi, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi
-; FALLBACK28-NEXT: movl %edi, %ebp
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx
-; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %ebp, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %edx
-; FALLBACK28-NEXT: addl %edi, %edi
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %edi
-; FALLBACK28-NEXT: orl %edx, %edi
-; FALLBACK28-NEXT: movl %esi, %edx
-; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi
-; FALLBACK28-NEXT: movl %esi, %ebx
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax
-; FALLBACK28-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebp
-; FALLBACK28-NEXT: orl %ebx, %ebp
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: addl %esi, %esi
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: orl %ebx, %esi
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx
-; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %edx
-; FALLBACK28-NEXT: orl %eax, %edx
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK28-NEXT: sarl %cl, %ebx
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl %ebx, 60(%eax)
-; FALLBACK28-NEXT: movl %edx, 56(%eax)
-; FALLBACK28-NEXT: movl %esi, 48(%eax)
-; FALLBACK28-NEXT: movl %ebp, 52(%eax)
-; FALLBACK28-NEXT: movl %edi, 40(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 44(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 32(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 36(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 24(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 28(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 16(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 20(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 8(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 12(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, (%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 4(%eax)
-; FALLBACK28-NEXT: addl $204, %esp
-; FALLBACK28-NEXT: popl %esi
-; FALLBACK28-NEXT: popl %edi
-; FALLBACK28-NEXT: popl %ebx
-; FALLBACK28-NEXT: popl %ebp
-; FALLBACK28-NEXT: vzeroupper
-; FALLBACK28-NEXT: retl
-;
-; FALLBACK29-LABEL: ashr_64bytes:
-; FALLBACK29: # %bb.0:
-; FALLBACK29-NEXT: pushl %ebp
-; FALLBACK29-NEXT: pushl %ebx
-; FALLBACK29-NEXT: pushl %edi
-; FALLBACK29-NEXT: pushl %esi
-; FALLBACK29-NEXT: subl $188, %esp
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK29-NEXT: vmovups (%eax), %ymm0
-; FALLBACK29-NEXT: vmovups 32(%eax), %xmm1
-; FALLBACK29-NEXT: movl 48(%eax), %edx
-; FALLBACK29-NEXT: movl 52(%eax), %esi
-; FALLBACK29-NEXT: movl 56(%eax), %edi
-; FALLBACK29-NEXT: movl 60(%eax), %eax
-; FALLBACK29-NEXT: movl (%ecx), %ecx
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: sarl $31, %eax
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %ecx, %ebp
-; FALLBACK29-NEXT: andl $60, %ebp
-; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shll $3, %ecx
-; FALLBACK29-NEXT: andl $24, %ecx
-; FALLBACK29-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %esi
-; FALLBACK29-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi
-; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl %esi, %edx
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edi
-; FALLBACK29-NEXT: shrdl %cl, %esi, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edi
-; FALLBACK29-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx
-; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK29-NEXT: movl %edx, 56(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK29-NEXT: sarl %cl, %eax
-; FALLBACK29-NEXT: movl %eax, 60(%ebp)
-; FALLBACK29-NEXT: movl %esi, 48(%ebp)
-; FALLBACK29-NEXT: movl %edi, 52(%ebp)
-; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 40(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 44(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 32(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 36(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 24(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 28(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 16(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 20(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 8(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 12(%ebp)
-; FALLBACK29-NEXT: movl %ebx, (%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 4(%ebp)
-; FALLBACK29-NEXT: addl $188, %esp
-; FALLBACK29-NEXT: popl %esi
-; FALLBACK29-NEXT: popl %edi
-; FALLBACK29-NEXT: popl %ebx
-; FALLBACK29-NEXT: popl %ebp
-; FALLBACK29-NEXT: vzeroupper
-; FALLBACK29-NEXT: retl
-;
-; FALLBACK30-LABEL: ashr_64bytes:
-; FALLBACK30: # %bb.0:
-; FALLBACK30-NEXT: pushl %ebp
-; FALLBACK30-NEXT: pushl %ebx
-; FALLBACK30-NEXT: pushl %edi
-; FALLBACK30-NEXT: pushl %esi
-; FALLBACK30-NEXT: subl $204, %esp
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK30-NEXT: vmovups 32(%ecx), %xmm1
-; FALLBACK30-NEXT: movl 48(%ecx), %edx
-; FALLBACK30-NEXT: movl 52(%ecx), %esi
-; FALLBACK30-NEXT: movl 56(%ecx), %edi
-; FALLBACK30-NEXT: movl 60(%ecx), %ecx
-; FALLBACK30-NEXT: movl (%eax), %eax
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: sarl $31, %ecx
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %eax, %ecx
-; FALLBACK30-NEXT: leal (,%eax,8), %edx
-; FALLBACK30-NEXT: andl $24, %edx
-; FALLBACK30-NEXT: andl $60, %ecx
-; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi
-; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi
-; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, %esi, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl %edx, %ebx
-; FALLBACK30-NEXT: notb %bl
-; FALLBACK30-NEXT: leal (%edi,%edi), %ebp
-; FALLBACK30-NEXT: shlxl %ebx, %ebp, %eax
-; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
-; FALLBACK30-NEXT: addl %esi, %esi
-; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK30-NEXT: orl %edi, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi
-; FALLBACK30-NEXT: leal (%esi,%esi), %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi
-; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK30-NEXT: orl %eax, %edi
-; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: leal (%eax,%eax), %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi
-; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: orl %esi, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi
-; FALLBACK30-NEXT: leal (%esi,%esi), %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi
-; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK30-NEXT: orl %eax, %edi
-; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: leal (%eax,%eax), %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi
-; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: orl %esi, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: leal (%eax,%eax), %esi
-; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi
-; FALLBACK30-NEXT: movl %ecx, %edi
-; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; FALLBACK30-NEXT: addl %esi, %esi
-; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK30-NEXT: orl %ecx, %esi
-; FALLBACK30-NEXT: movl 120(%esp,%edi), %ebp
-; FALLBACK30-NEXT: leal (%ebp,%ebp), %ecx
-; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT: movl 116(%esp,%edi), %eax
-; FALLBACK30-NEXT: shrxl %edx, %eax, %edi
-; FALLBACK30-NEXT: orl %edi, %ecx
-; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: addl %eax, %eax
-; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi
-; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK30-NEXT: shrxl %edx, %ebp, %eax
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK30-NEXT: movl 124(%esp,%ebp), %ebp
-; FALLBACK30-NEXT: sarxl %edx, %ebp, %edx
-; FALLBACK30-NEXT: addl %ebp, %ebp
-; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebx
-; FALLBACK30-NEXT: orl %eax, %ebx
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl %edx, 60(%eax)
-; FALLBACK30-NEXT: movl %ebx, 56(%eax)
-; FALLBACK30-NEXT: movl %edi, 48(%eax)
-; FALLBACK30-NEXT: movl %ecx, 52(%eax)
-; FALLBACK30-NEXT: movl %esi, 40(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 44(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 32(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 36(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 24(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 28(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 16(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 20(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 8(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 12(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, (%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 4(%eax)
-; FALLBACK30-NEXT: addl $204, %esp
-; FALLBACK30-NEXT: popl %esi
-; FALLBACK30-NEXT: popl %edi
-; FALLBACK30-NEXT: popl %ebx
-; FALLBACK30-NEXT: popl %ebp
-; FALLBACK30-NEXT: vzeroupper
-; FALLBACK30-NEXT: retl
-;
-; FALLBACK31-LABEL: ashr_64bytes:
-; FALLBACK31: # %bb.0:
-; FALLBACK31-NEXT: pushl %ebp
-; FALLBACK31-NEXT: pushl %ebx
-; FALLBACK31-NEXT: pushl %edi
-; FALLBACK31-NEXT: pushl %esi
-; FALLBACK31-NEXT: subl $188, %esp
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: vmovups (%eax), %ymm0
-; FALLBACK31-NEXT: vmovups 32(%eax), %xmm1
-; FALLBACK31-NEXT: movl 48(%eax), %edx
-; FALLBACK31-NEXT: movl 52(%eax), %esi
-; FALLBACK31-NEXT: movl 56(%eax), %edi
-; FALLBACK31-NEXT: movl 60(%eax), %eax
-; FALLBACK31-NEXT: movl (%ecx), %ecx
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: sarl $31, %eax
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %ecx, %ebp
-; FALLBACK31-NEXT: andl $60, %ebp
-; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shll $3, %ecx
-; FALLBACK31-NEXT: andl $24, %ecx
-; FALLBACK31-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %esi
-; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK31-NEXT: movl %edi, %edx
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK31-NEXT: movl %eax, 56(%ebp)
-; FALLBACK31-NEXT: movl %esi, 48(%ebp)
-; FALLBACK31-NEXT: movl %edx, 52(%ebp)
-; FALLBACK31-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 44(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 32(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 36(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 24(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 28(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 16(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 20(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 8(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 12(%ebp)
-; FALLBACK31-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK31-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK31-NEXT: movl %edi, (%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK31-NEXT: movl %eax, 60(%ebp)
-; FALLBACK31-NEXT: addl $188, %esp
-; FALLBACK31-NEXT: popl %esi
-; FALLBACK31-NEXT: popl %edi
-; FALLBACK31-NEXT: popl %ebx
-; FALLBACK31-NEXT: popl %ebp
-; FALLBACK31-NEXT: vzeroupper
-; FALLBACK31-NEXT: retl
+; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
+; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %edi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rdi,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %edi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %r8, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r15, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rdi), %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r13,%r13), %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r12, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rdi,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r13, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r8, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r10, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r15,%r15), %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r12, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r15, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rax,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r14, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r12, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rdi), %rax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %edi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rcx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rdi,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %edi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r14, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%rdi), %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r13, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r14, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r12, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%rdi,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rdi), %rcx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r14,%r14), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rbx, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r13, %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r15, %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r12, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rax,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r9, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rdi), %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %r11, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
+; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rdi), %rax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl (%rsi), %edi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rcx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rdi,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andl $56, %edi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -128(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -120(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
+; 
X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %r8 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -104(%rsp,%rdi), %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -96(%rsp,%rdi), %r12 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r12,%r12), %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -112(%rsp,%rdi), %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r14, %r10 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -88(%rsp,%rdi), %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r14, %r13 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r13 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -80(%rsp,%rdi), %rbp +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%rbp,%rbp), %r15 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r15 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r13, %r15 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r14, %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r12, %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rbp +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%rdi,%rdi), %r12 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r12 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %rbp, %r12 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r9 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 56(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r12, 48(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r14, 32(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r15, 40(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 16(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 24(%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, (%rdx) +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq $8, %rsp +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r12 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r13 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r14 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r15 +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbp +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes: +; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %r14 +; 
X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rdi), %rcx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $56, %eax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r10 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r9, %r8 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r9, %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r11, %r9 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -128(%rsp,%rax), %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %rax +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r10, %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %r11 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 56(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 40(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r14, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %rbx +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %r14 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %r15 +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes: +; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; 
X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rdi), %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rdi), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl (%rsi), %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %ecx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %eax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r10,%r10), %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rdi, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r11, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r14,%r14), %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rbx, %r8 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rbx, %r11 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rbx, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r12,%r12), %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r13, %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r15, %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r14, %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rbx, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r14, %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r12, %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r15, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r14, %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r10, %r10 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r10, %rcx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rsi, %rax, %rax +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 56(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r15, 48(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbx, 32(%rdx) +; 
X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r13, 40(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbx +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r12 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r13 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r14 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r15 +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes: +; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rdi), %rcx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%rsi), %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %ecx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %eax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r9, %r8 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r9, %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r11 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r11, %r9 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -128(%rsp,%rax), %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %rax +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r10, %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %r11, %r10 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbx, 40(%rdx) 
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r14, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 56(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbx +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %r14 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %r15 +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq +; +; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes: +; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%ecx), %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%ecx), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%ecx), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%ecx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%ecx), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ebp), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 
{{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: sarl $31, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%esi), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %ch +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%esi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edx +; 
X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%esi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%esi), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edx, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%esi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%esi), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%esi,%esi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%edx), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 96(%esp,%eax), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edx, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebp, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; 
X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 100(%esp,%edx), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 104(%esp,%edx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%esi,%esi), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 112(%esp,%ebp), %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ecx,%ecx), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 116(%esp,%edx), %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 120(%esp,%edx), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edx,%edx), %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 124(%esp,%edx), %ebx +; 
X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: sarl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 60(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 48(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 52(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 40(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes: +; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 
16(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%eax), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%eax), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%eax), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarl $31, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 
%eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 104(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 100(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes: +; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 
4(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %ebx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %edx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %ecx +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; 
X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%eax,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 96(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 92(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 112(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 108(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 120(%esp,%ebp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 116(%esp,%ebp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 124(%esp,%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 52(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarl $31, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes:
+; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: sarl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%eax), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%eax), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%eax), %xmm2
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarl $31, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%eax,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 96(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 92(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 112(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 108(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 120(%esp,%ebp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%edi,%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 116(%esp,%ebp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 124(%esp,%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 52(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%eax), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%eax), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%eax), %xmm2
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarl $31, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
+; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 48(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 52(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 56(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 60(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+;
X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $204, %esp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes: +; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%eax), %ymm0 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%eax), %xmm1 +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%eax), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%eax), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%eax), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%ecx), %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi +; 
X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 88(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 104(%esp,%ebp), %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 100(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 108(%esp,%ebp), %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 56(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %eax +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 52(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, (%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $188, %esp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper +; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes: +; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%ecx), %xmm1 +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl (%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 
{{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%eax,8), %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $24, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $60, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %dl +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 80(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 88(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 84(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 96(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 92(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 104(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 100(%esp,%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 112(%esp,%ecx), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 108(%esp,%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ecx, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 120(%esp,%ebp), %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%edi,%edi), %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ecx, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 116(%esp,%ebp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %eax, %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %eax, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 124(%esp,%eax), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %edx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ebx, %eax, %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 60(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 56(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 48(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 52(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; 
X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 40(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 44(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 32(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 36(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 24(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 28(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 16(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 20(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 8(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 12(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, (%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 4(%eax) +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $204, %esp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes: +; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%eax), %ymm0 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%eax), %xmm1 +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%eax), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%ecx), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 
%eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $60, %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shll $3, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $24, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edx, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 88(%esp,%ebp), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: 
shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 104(%esp,%ebp), %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 100(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 108(%esp,%ebp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %eax +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 56(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 52(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 40(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $188, %esp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper +; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %byteOff = load i512, ptr %byteOff.ptr, align 1 %bitOff = shl i512 %byteOff, 3 diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 338e104..221a51e 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -712,33 +712,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, 
ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -994,42 +994,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
%esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %al, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, 28(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1297,33 +1297,33 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1487,31 +1487,31 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: 
leaq (%r8,%r8), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rsi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes: @@ -1761,88 +1761,90 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, 
%edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %eax, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -2040,32 +2042,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: negb 
%cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -16(%rsp,%rdi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; @@ -2319,97 +2321,101 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, 
{{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 92(%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, 92(%esp,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -2610,31 +2616,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rsi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rsi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes: @@ -2927,60 +2933,59 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 32(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %esi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 24(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3263,13 +3268,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 @@ -3292,65 +3295,63 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: 
andl $56, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: 
shrxq %rcx, %r12, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes: @@ -3868,20 +3869,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -3906,116 +3907,117 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
68(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %esi 
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ecx), %eax -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4388,10 +4390,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax @@ -4419,63 +4419,61 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl 
%eax, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r15, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r15, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rbx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) 
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes: @@ -4972,33 +4970,33 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), 
%ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 @@ -5011,7 +5009,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -5032,149 +5030,152 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%ecx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 56(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 60(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 40(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 44(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 32(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 36(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -5534,13 +5535,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -5567,65 +5566,63 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
@@ -6221,33 +6218,31 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
@@ -6256,87 +6251,84 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index c3054a3..6b5c604 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -1635,22 +1635,22 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1807,40 +1807,43 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -1906,13 +1909,13 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: leal (,%rsi,8), %eax
; X64-BMI2-NEXT: andl $56, %eax
-; X64-BMI2-NEXT: andl $56, %esi
-; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
-; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: movl %eax, %ecx
; X64-BMI2-NEXT: notl %eax
-; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
-; X64-BMI2-NEXT: addl %esi, %esi
-; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT: addl %edi, %edi
+; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax
+; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx
; X64-BMI2-NEXT: orl %eax, %ecx
; X64-BMI2-NEXT: movb %cl, (%rdx)
; X64-BMI2-NEXT: popq %rax
@@ -2070,13 +2073,13 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: leal (,%rsi,8), %eax
; X64-BMI2-NEXT: andl $56, %eax
-; X64-BMI2-NEXT: andl $56, %esi
-; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
-; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: movl %eax, %ecx
; X64-BMI2-NEXT: notl %eax
-; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
-; X64-BMI2-NEXT: addl %esi, %esi
-; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT: addl %edi, %edi
+; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax
+; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx
; X64-BMI2-NEXT: orl %eax, %ecx
; X64-BMI2-NEXT: movw %cx, (%rdx)
; X64-BMI2-NEXT: popq %rax
@@ -2233,13 +2236,13 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: leal (,%rsi,8), %eax
; X64-BMI2-NEXT: andl $56, %eax
-; X64-BMI2-NEXT: andl $56, %esi
-; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
-; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: movl %eax, %ecx
; X64-BMI2-NEXT: notl %eax
-; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
-; X64-BMI2-NEXT: addl %esi, %esi
-; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT: addl %edi, %edi
+; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax
+; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx
; X64-BMI2-NEXT: orl %eax, %ecx
; X64-BMI2-NEXT: movl %ecx, (%rdx)
; X64-BMI2-NEXT: popq %rax
@@ -2521,10 +2524,11 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $140, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -2541,25 +2545,26 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $dl killed $dl killed $edx def $edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp,%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $140, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2667,21 +2672,21 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r10, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
@@ -2860,33 +2865,33 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 16(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 8(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp
@@ -3026,9 +3031,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
@@ -3043,38 +3046,36 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -3304,7 +3305,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $172, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
@@ -3320,59 +3321,60 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 32(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%eax), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%eax), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 24(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
@@ -3380,7 +3382,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $172, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 84c2cc6..bed8e58 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -168,8 +168,8 @@ define void @load_2byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movb %al, (%rdx)
@@ -188,17 +188,15 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -215,13 +213,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
@@ -236,14 +232,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -260,23 +253,19 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %dl, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -292,8 +281,8 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
@@ -312,17 +301,15 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -339,18 +326,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
@@ -360,14 +345,11 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -386,18 +368,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
@@ -413,8 +393,8 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
@@ -433,17 +413,15 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -460,18 +438,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
@@ -481,14 +457,11 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -507,18 +480,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
@@ -536,8 +507,8 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -557,8 +528,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -571,8 +542,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -591,8 +562,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -698,8 +669,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -719,8 +690,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -733,8 +704,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -753,8 +724,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -859,8 +830,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -880,8 +851,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -894,8 +865,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -914,8 +885,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -1020,8 +991,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -1041,8 +1012,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -1055,8 +1026,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -1075,8 +1046,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -1908,22 +1879,22 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8),
%rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca: @@ -2084,40 +2055,43 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; 
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
index 4d261a9..37620ec 100644
--- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -820,7 +820,7 @@ define void @infiniteloop() {
; ENABLE-NEXT: movq %rsp, %rax
; ENABLE-NEXT: addq $-16, %rax
; ENABLE-NEXT: movq %rax, %rsp
-; ENABLE-NEXT: xorl %ecx, %ecx
+; ENABLE-NEXT: xorl %ecx, %ecx
; ENABLE-NEXT: .p2align 4
; ENABLE-NEXT: LBB10_2: ## %for.body
; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1
@@ -851,8 +851,8 @@ define void @infiniteloop() {
; DISABLE-NEXT: ## %bb.1: ## %if.then
; DISABLE-NEXT: movq %rsp, %rax
; DISABLE-NEXT: addq $-16, %rax
-; DISABLE-NEXT: movq %rax, %rsp
-; DISABLE-NEXT: xorl %ecx, %ecx
+; DISABLE-NEXT: movq %rax, %rsp
+; DISABLE-NEXT: xorl %ecx, %ecx
; DISABLE-NEXT: .p2align 4
; DISABLE-NEXT: LBB10_2: ## %for.body
; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/X86/x87-stack-pop.mir b/llvm/test/CodeGen/X86/x87-stack-pop.mir
index 1c4ffa5..73144fd 100644
--- a/llvm/test/CodeGen/X86/x87-stack-pop.mir
+++ b/llvm/test/CodeGen/X86/x87-stack-pop.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=i686-- -run-pass x86-codegen -O2 -o - %s | FileCheck %s
+# RUN: llc -mtriple=i686-- -run-pass=x86-fp-stackifier -O2 -o - %s | FileCheck %s
+# RUN: llc -mtriple=i686-- -passes=x86-fp-stackifier -O2 -o - %s | FileCheck %s
---
name: func_fxam
