diff options
Diffstat (limited to 'llvm/test/CodeGen/X86')
18 files changed, 963 insertions, 1357 deletions
diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll index 87059c5..6ae7b22 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 { ; CHECK-LABEL: @test_no_bitcast( diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll index 5fb2dcd..ca7c357 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) { ; CHECK-LABEL: @test_amx_load_non_O0( diff --git a/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll new file mode 100644 index 0000000..841c9a6 --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc 
-mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=CHECK + +define void @test_reloc_none() { +; CHECK-LABEL: test_reloc_none: +; CHECK: # %bb.0: +; CHECK-NEXT: .Lreloc_none0: +; CHECK-NEXT: .reloc .Lreloc_none0, BFD_RELOC_NONE, foo +; CHECK-NEXT: retq + call void @llvm.reloc.none(metadata !"foo") + ret void +} + +declare void @llvm.reloc.none(metadata) diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir index 41e1b5b..5c059a4 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir @@ -1,5 +1,6 @@ -# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 -# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X86 +# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X64 --- | @@ -30,24 +31,23 @@ ... 
--- name: test_copy -# ALL-LABEL: name: test_copy alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -56,24 +56,23 @@ body: | ... --- name: test_copy2 -# ALL-LABEL: name: test_copy2 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy2 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -82,30 +81,35 @@ body: | ... 
--- name: test_copy3 -# ALL-LABEL: name: test_copy3 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr16[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr16 = COPY $ax -# X32-NEXT: %3:gr16_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; X86-LABEL: name: test_copy3 + ; X86: liveins: $eax + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr16_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy3 + ; X64: liveins: $eax + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s16) = COPY $ax %1(s8) = G_TRUNC %0(s16) %2(s32) = G_ZEXT %1(s8) @@ -115,27 +119,25 @@ body: | ... 
--- name: test_copy4 -# ALL-LABEL: name: test_copy4 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $eax -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %2:gr32 = MOVZX32rr16 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy4 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $eax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[MOVZX32rr16_:%[0-9]+]]:gr32 = MOVZX32rr16 [[COPY1]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr16_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $eax %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ZEXT %1(s16) @@ -145,30 +147,35 @@ body: | ... 
--- name: test_copy5 -# ALL-LABEL: name: test_copy5 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# X32-NEXT: %3:gr32_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; X86-LABEL: name: test_copy5 + ; X86: liveins: $eax, $edx + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr32_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy5 + ; X64: liveins: $eax, $edx + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s8) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s8) @@ -178,29 +185,26 @@ body: | ... 
--- name: test_copy6 -# ALL-LABEL: name: test_copy6 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 3, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %3:low32_addr_access_rbp = IMPLICIT_DEF -# ALL-NEXT: %2:low32_addr_access_rbp = INSERT_SUBREG %3, %1, %subreg.sub_16bit -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; CHECK-LABEL: name: test_copy6 + ; CHECK: liveins: $eax, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[DEF:%[0-9]+]]:low32_addr_access_rbp = IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:low32_addr_access_rbp = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.sub_16bit + ; CHECK-NEXT: $eax = COPY [[INSERT_SUBREG]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s16) diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll index 79849a7..d9b4635 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll @@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src, ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512: ; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08] ; X86-NEXT: vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1] diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll index 0f2c75b..01b7618 100644 --- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll +++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll @@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src, ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256: ; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08] ; X86-NEXT: vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1] @@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8 ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08] ; X86-NEXT: vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: 
[0x62,0xf5,0x7d,0x09,0x5c,0xc1] diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index c311ab8..9d31c29 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -356,41 +356,20 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB9_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl $0, %edx -; X86-NEXT: .LBB9_2: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: notl %esi -; X86-NEXT: notl %edx -; X86-NEXT: je .LBB9_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB9_4: -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $32, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%ebx,%eax), %eax -; X86-NEXT: btl %ecx, %eax +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: movl %edx, (%ebx) -; X86-NEXT: movl %esi, 4(%ebx) +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -600,208 +579,55 @@ define i1 @set_ne_i128(ptr %word, i32 %position) 
nounwind { define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $96, %esp -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %ebx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 64(%esp,%eax), %edx -; X86-NEXT: movl 68(%esp,%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: movl 72(%esp,%esi), %ebx -; X86-NEXT: movl 76(%esp,%esi), %esi -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: notl %edi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: movl 36(%esp,%ecx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; X86-NEXT: movl 40(%esp,%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: andl 8(%eax), %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: notl %esi -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl 44(%esp,%eax), %eax -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: andl 12(%ecx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: notl %ebx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl 32(%esp,%eax), %edx -; X86-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: andl (%eax), %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: andl 4(%ecx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl 12(%ebp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: andl $96, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%ecx,%eax), %eax -; X86-NEXT: btl %esi, %eax -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $96, %esi +; 
X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %r9d, %r9d -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %r9, %rsi -; SSE-NEXT: notq %r8 -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %r9, %rax -; SSE-NEXT: notq %rsi -; SSE-NEXT: andq 8(%rdi), %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq (%rdi), %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: movl %ecx, %eax -; SSE-NEXT: andl $96, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: btl %ecx, %eax +; SSE-NEXT: andl $96, %esi +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d ; SSE-NEXT: setae %al -; SSE-NEXT: movq %rsi, (%rdi) -; SSE-NEXT: movq %r8, 8(%rdi) +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %rax, %rsi -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: xorl %r8d, %r8d -; AVX2-NEXT: shldq %cl, %rdx, %r8 -; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shlxq %rcx, %rax, %rax -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq 
%rax, %rsi -; AVX2-NEXT: cmovneq %r9, %rax -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: cmovneq %rdx, %r8 -; AVX2-NEXT: cmovneq %r9, %rdx -; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: andnq (%rdi), %rax, %r8 -; AVX2-NEXT: orq %rdx, %r8 -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $96, %eax -; AVX2-NEXT: shrl $3, %eax -; AVX2-NEXT: movl (%rdi,%rax), %eax -; AVX2-NEXT: btl %ecx, %eax -; AVX2-NEXT: setae %al -; AVX2-NEXT: movq %r8, (%rdi) -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rax, %rsi -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shlxq %rcx, %rax, %rax -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: xorl %r9d, %r9d -; AVX512-NEXT: shldq %cl, %rdx, %r9 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: cmovneq %r8, %rax -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: cmovneq %rdx, %r9 -; AVX512-NEXT: cmovneq %r8, %rdx -; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX512-NEXT: orq %r9, %rsi -; AVX512-NEXT: andnq (%rdi), %rax, %r8 -; AVX512-NEXT: orq %rdx, %r8 -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: andl $96, %eax -; AVX512-NEXT: shrl $3, %eax -; AVX512-NEXT: movl (%rdi,%rax), %eax -; AVX512-NEXT: btl %ecx, %eax -; AVX512-NEXT: setae %al -; AVX512-NEXT: movq %r8, (%rdi) -; AVX512-NEXT: movq %rsi, 8(%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: andl $96, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -977,673 +803,55 
@@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $352, %esp # imm = 0x160 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: movl 
%edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%eax), %edi -; X86-NEXT: movl 44(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl 16(%ebp), %eax -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %ebx 
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 60(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 56(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 52(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 48(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 40(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 44(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 36(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 40(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 32(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 36(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 28(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 32(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 24(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 28(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 20(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 24(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 16(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 20(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 12(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 16(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 8(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 12(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 4(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 8(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: andl 4(%edx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%edx,%eax), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 60(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 56(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 52(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 
48(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl %ebx, 8(%edx) -; X86-NEXT: movl %edi, 4(%edx) -; X86-NEXT: movl %esi, (%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i512: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; 
SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: movl %esi, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: # kill: def $eax killed $eax killed $rax -; SSE-NEXT: andl $56, %eax -; SSE-NEXT: negl %eax -; SSE-NEXT: movslq %eax, %r12 -; SSE-NEXT: movq 136(%rsp,%r12), %r9 -; SSE-NEXT: movq 144(%rsp,%r12), %rax -; SSE-NEXT: movq %rax, %rsi -; SSE-NEXT: shldq %cl, %r9, %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 152(%rsp,%r12), %r11 -; SSE-NEXT: shldq %cl, %rax, %r11 -; SSE-NEXT: movq 120(%rsp,%r12), %r10 -; SSE-NEXT: movq 128(%rsp,%r12), %rax -; SSE-NEXT: movq %rax, %rbx -; SSE-NEXT: shldq %cl, %r10, %rbx -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: movq 104(%rsp,%r12), %r14 -; SSE-NEXT: movq 112(%rsp,%r12), %rax -; SSE-NEXT: movq %rax, %r15 -; SSE-NEXT: shldq %cl, %r14, %r15 -; SSE-NEXT: shldq %cl, %rax, %r10 -; SSE-NEXT: movq 96(%rsp,%r12), %rax -; SSE-NEXT: movq %rax, %r13 -; SSE-NEXT: shlq %cl, %r13 -; SSE-NEXT: shldq %cl, %rax, %r14 -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq 8(%rsp,%r12), %r8 -; SSE-NEXT: movq 16(%rsp,%r12), %rsi -; SSE-NEXT: movq %rsi, %rbp -; SSE-NEXT: shldq %cl, %r8, %rbp -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; 
SSE-NEXT: notq %rax -; SSE-NEXT: andq 48(%rdi), %rax -; SSE-NEXT: orq %rbp, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: notq %rbx -; SSE-NEXT: notq %r11 -; SSE-NEXT: movq 24(%rsp,%r12), %rax -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq -8(%rsp,%r12), %rbp -; SSE-NEXT: movq (%rsp,%r12), %rdx -; SSE-NEXT: movq %rdx, %rsi -; SSE-NEXT: shldq %cl, %rbp, %rsi -; SSE-NEXT: andq 56(%rdi), %r11 -; SSE-NEXT: andq 32(%rdi), %rbx -; SSE-NEXT: orq %rax, %r11 -; SSE-NEXT: orq %rsi, %rbx -; SSE-NEXT: notq %r15 -; SSE-NEXT: shldq %cl, %rdx, %r8 -; SSE-NEXT: notq %r9 -; SSE-NEXT: andq 40(%rdi), %r9 -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: movq -24(%rsp,%r12), %rax -; SSE-NEXT: movq -16(%rsp,%r12), %rdx -; SSE-NEXT: movq %rdx, %rsi -; SSE-NEXT: shldq %cl, %rax, %rsi -; SSE-NEXT: andq 16(%rdi), %r15 -; SSE-NEXT: orq %rsi, %r15 -; SSE-NEXT: shldq %cl, %rdx, %rbp -; SSE-NEXT: notq %r10 -; SSE-NEXT: notq %r13 -; SSE-NEXT: movq -32(%rsp,%r12), %rdx -; SSE-NEXT: movq %rdx, %rsi -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: andq 24(%rdi), %r10 -; SSE-NEXT: andq (%rdi), %r13 -; SSE-NEXT: orq %rbp, %r10 -; SSE-NEXT: orq %rsi, %r13 -; SSE-NEXT: notq %r14 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: andq 8(%rdi), %r14 -; SSE-NEXT: orq %rax, %r14 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andl $60, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; SSE-NEXT: btl %ecx, %eax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq %rax, 48(%rdi) -; SSE-NEXT: movq %r11, 56(%rdi) -; SSE-NEXT: movq %rbx, 32(%rdi) -; SSE-NEXT: movq %r9, 40(%rdi) -; SSE-NEXT: movq %r15, 16(%rdi) -; SSE-NEXT: movq %r10, 24(%rdi) -; SSE-NEXT: movq %r13, (%rdi) -; SSE-NEXT: movq %r14, 8(%rdi) +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $60, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d ; 
SSE-NEXT: setae %al -; SSE-NEXT: addq $168, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $184, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: movl %esi, %ebx -; AVX2-NEXT: shrl $3, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: andl $56, %eax -; AVX2-NEXT: negl %eax -; AVX2-NEXT: movslq %eax, %r11 -; AVX2-NEXT: movq 128(%rsp,%r11), %r15 -; AVX2-NEXT: movq 136(%rsp,%r11), %rax -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 120(%rsp,%r11), %r8 -; AVX2-NEXT: shldq %cl, %r8, %r15 -; AVX2-NEXT: movq 144(%rsp,%r11), %r14 -; AVX2-NEXT: movq 152(%rsp,%r11), %rsi -; AVX2-NEXT: movq %rsi, %r9 -; AVX2-NEXT: shldq %cl, %r14, %r9 -; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rax, %r14 -; AVX2-NEXT: movq 112(%rsp,%r11), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 160(%rsp,%r11), %r13 -; AVX2-NEXT: movq 168(%rsp,%r11), %r12 -; AVX2-NEXT: shldq %cl, %r13, %r12 -; AVX2-NEXT: shldq %cl, %rsi, %r13 -; AVX2-NEXT: shldq %cl, %rax, %r8 -; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 24(%rsp,%r11), %rbp -; AVX2-NEXT: movq 32(%rsp,%r11), %rdx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shldq %cl, %rbp, %rax -; AVX2-NEXT: movq 40(%rsp,%r11), %r10 -; AVX2-NEXT: shldq %cl, %rdx, %r10 -; AVX2-NEXT: movq 8(%rsp,%r11), %r9 -; AVX2-NEXT: movq 16(%rsp,%r11), %rdx -; AVX2-NEXT: movq %rdx, %r8 -; AVX2-NEXT: shldq %cl, %r9, %r8 -; AVX2-NEXT: shldq %cl, %rdx, %rbp -; AVX2-NEXT: andnq 48(%rdi), %r13, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq -8(%rsp,%r11), %rax -; AVX2-NEXT: movq (%rsp,%r11), %rdx -; AVX2-NEXT: movq %rdx, %rsi -; AVX2-NEXT: shldq %cl, %rax, %rsi -; AVX2-NEXT: shldq %cl, %rdx, %r9 -; AVX2-NEXT: andnq 56(%rdi), %r12, %r12 -; AVX2-NEXT: andnq 32(%rdi), %r14, %r14 -; AVX2-NEXT: orq %r10, %r12 -; AVX2-NEXT: orq %r8, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX2-NEXT: andnq 40(%rdi), %rdx, %rdx -; AVX2-NEXT: orq %rbp, %rdx -; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT: movq -16(%rsp,%r11), %r10 -; AVX2-NEXT: shlxq %rcx, %r10, %r11 -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %r10, %rax -; AVX2-NEXT: andnq 16(%rdi), %r15, %rcx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andnq 24(%rdi), %r10, %r10 -; AVX2-NEXT: orq %rsi, %rcx -; AVX2-NEXT: orq %r9, %r10 -; AVX2-NEXT: andnq (%rdi), %r8, %rsi -; AVX2-NEXT: orq %r11, %rsi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andnq 8(%rdi), %r8, %r8 -; AVX2-NEXT: orq %rax, %r8 -; AVX2-NEXT: andl $60, %ebx -; AVX2-NEXT: movl (%rdi,%rbx), %eax -; AVX2-NEXT: 
movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; AVX2-NEXT: btl %r9d, %eax -; AVX2-NEXT: movq %r13, 48(%rdi) -; AVX2-NEXT: movq %r12, 56(%rdi) -; AVX2-NEXT: movq %r14, 32(%rdi) -; AVX2-NEXT: movq %rdx, 40(%rdi) -; AVX2-NEXT: movq %rcx, 16(%rdi) -; AVX2-NEXT: movq %r10, 24(%rdi) -; AVX2-NEXT: movq %rsi, (%rdi) -; AVX2-NEXT: movq %r8, 8(%rdi) -; AVX2-NEXT: setae %al -; AVX2-NEXT: addq $184, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $168, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: movl %esi, %r10d -; AVX512-NEXT: shrl $3, %r10d -; AVX512-NEXT: movl %r10d, %r8d -; AVX512-NEXT: andl $56, %r8d -; AVX512-NEXT: negl %r8d -; AVX512-NEXT: movslq %r8d, %r9 -; AVX512-NEXT: movq 112(%rsp,%r9), %r11 -; AVX512-NEXT: movq 120(%rsp,%r9), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %r11, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 104(%rsp,%r9), %rax -; AVX512-NEXT: shldq %cl, %rax, %r11 -; AVX512-NEXT: movq 128(%rsp,%r9), %r15 -; AVX512-NEXT: movq 136(%rsp,%r9), %rbp -; AVX512-NEXT: movq %rbp, %rbx -; AVX512-NEXT: shldq %cl, %r15, %rbx -; AVX512-NEXT: shldq %cl, %r14, %r15 -; AVX512-NEXT: movq 144(%rsp,%r9), %r13 -; AVX512-NEXT: movq 
152(%rsp,%r9), %r12 -; AVX512-NEXT: shldq %cl, %r13, %r12 -; AVX512-NEXT: movq 96(%rsp,%r9), %r14 -; AVX512-NEXT: shldq %cl, %rbp, %r13 -; AVX512-NEXT: shldq %cl, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 8(%rsp,%r9), %r8 -; AVX512-NEXT: movq 16(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %rbp -; AVX512-NEXT: shldq %cl, %r8, %rbp -; AVX512-NEXT: andnq 48(%rdi), %r13, %r13 -; AVX512-NEXT: orq %rbp, %r13 -; AVX512-NEXT: movq 24(%rsp,%r9), %rbp -; AVX512-NEXT: shldq %cl, %rax, %rbp -; AVX512-NEXT: movq -8(%rsp,%r9), %rax -; AVX512-NEXT: movq (%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %rdx -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: andnq 56(%rdi), %r12, %r12 -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: andnq 32(%rdi), %r15, %r15 -; AVX512-NEXT: orq %rdx, %r15 -; AVX512-NEXT: shldq %cl, %rsi, %r8 -; AVX512-NEXT: movq -24(%rsp,%r9), %rdx -; AVX512-NEXT: movq -16(%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %rbp -; AVX512-NEXT: shldq %cl, %rdx, %rbp -; AVX512-NEXT: andnq 40(%rdi), %rbx, %rbx -; AVX512-NEXT: orq %r8, %rbx -; AVX512-NEXT: andnq 16(%rdi), %r11, %r8 -; AVX512-NEXT: orq %rbp, %r8 -; AVX512-NEXT: shlxq %rcx, %r14, %r11 -; AVX512-NEXT: movq -32(%rsp,%r9), %r9 -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT: andnq 24(%rdi), %rsi, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %r9, %rax -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %r9, %rdx -; AVX512-NEXT: andnq (%rdi), %r11, %rcx -; AVX512-NEXT: orq %rax, %rcx -; 
AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andnq 8(%rdi), %rax, %rax -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: andl $60, %r10d -; AVX512-NEXT: movl (%rdi,%r10), %edx -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; AVX512-NEXT: btl %r9d, %edx -; AVX512-NEXT: movq %r13, 48(%rdi) -; AVX512-NEXT: movq %r12, 56(%rdi) -; AVX512-NEXT: movq %r15, 32(%rdi) -; AVX512-NEXT: movq %rbx, 40(%rdi) -; AVX512-NEXT: movq %r8, 16(%rdi) -; AVX512-NEXT: movq %rsi, 24(%rdi) -; AVX512-NEXT: movq %rcx, (%rdi) -; AVX512-NEXT: movq %rax, 8(%rdi) -; AVX512-NEXT: setae %al -; AVX512-NEXT: addq $168, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i512: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $60, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs @@ -1698,115 +906,46 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_cmpz_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $64, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %esi -; X86-NEXT: movl 36(%esp,%esi), %eax -; X86-NEXT: movl 40(%esp,%esi), %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 32(%esp,%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esp,%esi), %esi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: xorl 12(%ecx), %esi -; X86-NEXT: xorl 8(%ecx), %edx -; X86-NEXT: xorl 4(%ecx), %eax -; X86-NEXT: xorl (%ecx), %edi -; X86-NEXT: movl %edx, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %edi, (%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: andl $96, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: xorl %edx, (%eax,%ecx) +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: orl 12(%eax), %edx +; X86-NEXT: orl 8(%eax), %ecx +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: complement_cmpz_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx ; SSE-NEXT: movl $1, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %rsi, %rax -; SSE-NEXT: xorq 8(%rdi), %rdx -; SSE-NEXT: xorq (%rdi), %rax -; SSE-NEXT: movq %rax, (%rdi) -; 
SSE-NEXT: movq %rdx, 8(%rdi) -; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: shll %cl, %eax +; SSE-NEXT: andl $96, %ecx +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: xorl %eax, (%rdi,%rcx) +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: orq 8(%rdi), %rax ; SSE-NEXT: setne %al ; SSE-NEXT: retq ; -; AVX2-LABEL: complement_cmpz_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %edx, %edx -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shlxq %rcx, %rax, %rax -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rdx -; AVX2-NEXT: cmovneq %rsi, %rax -; AVX2-NEXT: xorq 8(%rdi), %rdx -; AVX2-NEXT: xorq (%rdi), %rax -; AVX2-NEXT: movq %rax, (%rdi) -; AVX2-NEXT: movq %rdx, 8(%rdi) -; AVX2-NEXT: orq %rdx, %rax -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq -; -; AVX512-LABEL: complement_cmpz_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movl $1, %edx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rdx, %rsi -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rdx, %rsi -; AVX512-NEXT: cmovneq %rax, %rdx -; AVX512-NEXT: xorq 8(%rdi), %rsi -; AVX512-NEXT: xorq (%rdi), %rdx -; AVX512-NEXT: movq %rdx, (%rdi) -; AVX512-NEXT: movq %rsi, 8(%rdi) -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: setne %al -; AVX512-NEXT: retq +; AVX-LABEL: complement_cmpz_i128: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $esi killed $esi def $rsi +; AVX-NEXT: movl $1, %eax +; AVX-NEXT: shlxl %esi, %eax, %eax +; AVX-NEXT: andl $96, %esi +; AVX-NEXT: shrl $3, %esi +; AVX-NEXT: xorl %eax, (%rdi,%rsi) +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: orq 8(%rdi), %rax +; AVX-NEXT: setne %al +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -1821,14 +960,171 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { define i32 @reset_multiload_i128(ptr %word, i32 
%position, ptr %p) nounwind { ; X86-LABEL: reset_multiload_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: btrl %edx, %ebx +; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl %ebx, (%ecx,%esi) +; X86-NEXT: jae .LBB22_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB22_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: reset_multiload_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %r9d +; X64-NEXT: movl %r9d, %r8d +; X64-NEXT: btrl %esi, %r8d +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: btl %esi, %r9d +; X64-NEXT: jb .LBB22_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: .LBB22_2: +; X64-NEXT: movl %r8d, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %mask = xor i128 %bit, -1 + %ld = load i128, ptr %word + %sel = load i32, ptr %p + %test = and i128 %ld, %bit + %res = and i128 %ld, %mask + %cmp = icmp eq i128 %test, 0 + store i128 %res, ptr %word + %ret = select i1 %cmp, i32 %sel, i32 0 + ret i32 %ret +} + +; Multiple uses of the store chain AND stored value +define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind { +; X86-LABEL: chain_reset_i256: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: 
movl $-2, %edi +; X86-NEXT: roll %cl, %edi +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: andl $28, %ecx +; X86-NEXT: andl %edi, (%esi,%ecx) +; X86-NEXT: movl 8(%esi), %ebx +; X86-NEXT: movl (%esi), %edi +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 12(%esi), %ebp +; X86-NEXT: orl 28(%esi), %ebp +; X86-NEXT: orl 20(%esi), %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl 24(%esi), %ebx +; X86-NEXT: movl 16(%esi), %ebp +; X86-NEXT: orl %edi, %ebp +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl %edi, (%edx) +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: orl %ecx, %ebp +; X86-NEXT: jne .LBB23_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl %esi, %eax +; X86-NEXT: .LBB23_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: chain_reset_i256: +; SSE: # %bb.0: +; SSE-NEXT: # kill: def $ecx killed $ecx def $rcx +; SSE-NEXT: movl $-2, %eax +; SSE-NEXT: roll %cl, %eax +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: andl $28, %ecx +; SSE-NEXT: andl %eax, (%rdi,%rcx) +; SSE-NEXT: movq (%rdi), %rcx +; SSE-NEXT: movq 8(%rdi), %r8 +; SSE-NEXT: orq 24(%rdi), %r8 +; SSE-NEXT: movq 16(%rdi), %rdi +; SSE-NEXT: orq %rcx, %rdi +; SSE-NEXT: movl (%rsi), %eax +; SSE-NEXT: movl %ecx, (%rsi) +; SSE-NEXT: movl (%rdx), %ecx +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: orq %r8, %rdi +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: retq +; +; AVX-LABEL: chain_reset_i256: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX-NEXT: movl $-2, %eax +; AVX-NEXT: roll %cl, %eax +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $28, %ecx +; AVX-NEXT: andl %eax, (%rdi,%rcx) +; AVX-NEXT: vmovdqu (%rdi), %ymm0 +; AVX-NEXT: movl (%rdi), %ecx +; AVX-NEXT: movl (%rsi), %eax +; AVX-NEXT: movl %ecx, (%rsi) +; AVX-NEXT: movl (%rdx), %ecx +; AVX-NEXT: addl %ecx, %eax +; AVX-NEXT: vptest %ymm0, %ymm0 +; AVX-NEXT: cmovnel %ecx, %eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %rem = and i32 %position, 255 + %ofs = zext 
nneg i32 %rem to i256 + %bit = shl nuw i256 1, %ofs + %ld0 = load i256, ptr %p0 + %msk = xor i256 %bit, -1 + %res = and i256 %ld0, %msk + store i256 %res, ptr %p0 + %cmp = icmp ne i256 %res, 0 + %ld1 = load i32, ptr %p1 + %trunc = trunc i256 %res to i32 + store i32 %trunc, ptr %p1 + %ld2 = load i32, ptr %p2 + %add = add i32 %ld1, %ld2 + %sel = select i1 %cmp, i32 %ld2, i32 %add + ret i32 %sel +} + +; BTC/BT/BTS sequence on same i128 +define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind { +; X86-LABEL: sequence_i128: +; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: subl $144, %esp +; X86-NEXT: movb 20(%ebp), %ch +; X86-NEXT: movb 12(%ebp), %cl ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1842,36 +1138,80 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 40(%esp,%eax), %edx -; X86-NEXT: movl 44(%esp,%eax), %esi +; X86-NEXT: movl 56(%esp,%eax), %edx +; X86-NEXT: movl 60(%esp,%eax), %esi ; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl 32(%esp,%eax), %edi -; X86-NEXT: movl 36(%esp,%eax), %ebx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %ebx ; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: movl 16(%ebp), %eax -; X86-NEXT: movl (%eax), %eax +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movb %ch, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 84(%esp,%eax), %edx +; X86-NEXT: movl 88(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl 20(%ebp), %ecx +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 92(%esp,%eax), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: andl %ebx, 4(%eax) -; X86-NEXT: shll %cl, %edi -; X86-NEXT: notl %edi -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl $96, %ebx -; X86-NEXT: shrl $3, %ebx -; X86-NEXT: movl (%eax,%ebx), %ebx -; X86-NEXT: andl %edi, (%eax) -; X86-NEXT: notl %esi -; X86-NEXT: andl %esi, 12(%eax) -; X86-NEXT: notl %edx -; X86-NEXT: andl %edx, 8(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: btl %ecx, %ebx -; X86-NEXT: jae .LBB22_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB22_2: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: xorl 8(%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: xorl 12(%eax), %esi +; X86-NEXT: xorl (%eax), %edi +; X86-NEXT: xorl 4(%eax), %ebx +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; 
X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: andb $96, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 96(%esp,%eax), %eax +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setae %al +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, (%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -1879,96 +1219,129 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_multiload_i128: +; SSE-LABEL: sequence_i128: ; SSE: # %bb.0: +; SSE-NEXT: movl %ecx, %eax ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: shlq %cl, %rsi +; SSE-NEXT: movl $1, %r8d +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %r8, %rsi +; SSE-NEXT: movl $1, %r9d +; SSE-NEXT: shlq %cl, %r9 +; SSE-NEXT: xorl %r11d, %r11d ; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %rax, %rsi -; SSE-NEXT: notq %r8 -; SSE-NEXT: notq %rsi -; SSE-NEXT: movl %ecx, %r9d -; SSE-NEXT: andl $96, %r9d -; SSE-NEXT: shrl $3, %r9d -; SSE-NEXT: movl (%rdi,%r9), %r9d -; SSE-NEXT: btl %ecx, %r9d -; SSE-NEXT: jb .LBB22_2 
-; SSE-NEXT: # %bb.1: -; SSE-NEXT: movl (%rdx), %eax -; SSE-NEXT: .LBB22_2: -; SSE-NEXT: andq %rsi, (%rdi) -; SSE-NEXT: andq %r8, 8(%rdi) -; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: cmovneq %r9, %rsi +; SSE-NEXT: cmovneq %r11, %r9 +; SSE-NEXT: xorl %r10d, %r10d +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: shldq %cl, %r8, %r10 +; SSE-NEXT: shlq %cl, %r8 +; SSE-NEXT: testb $64, %al +; SSE-NEXT: cmovneq %r8, %r10 +; SSE-NEXT: cmovneq %r11, %r8 +; SSE-NEXT: xorq 8(%rdi), %rsi +; SSE-NEXT: xorq (%rdi), %r9 +; SSE-NEXT: movl %edx, %ecx +; SSE-NEXT: andb $32, %cl +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: shrdq %cl, %rsi, %rax +; SSE-NEXT: movq %rsi, %r11 +; SSE-NEXT: shrq %cl, %r11 +; SSE-NEXT: testb $64, %dl +; SSE-NEXT: cmoveq %rax, %r11 +; SSE-NEXT: btl %edx, %r11d +; SSE-NEXT: setae %al +; SSE-NEXT: orq %r10, %rsi +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: movq %r9, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) ; SSE-NEXT: retq ; -; AVX2-LABEL: reset_multiload_i128: +; AVX2-LABEL: sequence_i128: ; AVX2: # %bb.0: +; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: movl $1, %r8d +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: movl $1, %r10d ; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %r8, %rsi -; AVX2-NEXT: shlxq %rcx, %r8, %r8 +; AVX2-NEXT: shldq %cl, %r10, %rsi +; AVX2-NEXT: shlxq %rcx, %r10, %r8 ; AVX2-NEXT: testb $64, %cl ; AVX2-NEXT: cmovneq %r8, %rsi -; AVX2-NEXT: cmovneq %rax, %r8 -; AVX2-NEXT: notq %rsi -; AVX2-NEXT: notq %r8 -; AVX2-NEXT: movl %ecx, %r9d -; AVX2-NEXT: andl $96, %r9d -; AVX2-NEXT: shrl $3, %r9d -; AVX2-NEXT: movl (%rdi,%r9), %r9d -; AVX2-NEXT: btl %ecx, %r9d -; AVX2-NEXT: jb .LBB22_2 -; AVX2-NEXT: # %bb.1: -; AVX2-NEXT: movl (%rdx), %eax -; AVX2-NEXT: .LBB22_2: -; AVX2-NEXT: andq %r8, (%rdi) -; AVX2-NEXT: andq %rsi, 8(%rdi) -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: cmovneq %r9, %r8 +; AVX2-NEXT: xorl %r11d, %r11d +; AVX2-NEXT: movl %eax, %ecx +; 
AVX2-NEXT: shldq %cl, %r10, %r11 +; AVX2-NEXT: shlxq %rax, %r10, %r10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: cmovneq %r10, %r11 +; AVX2-NEXT: cmovneq %r9, %r10 +; AVX2-NEXT: xorq 8(%rdi), %rsi +; AVX2-NEXT: xorq (%rdi), %r8 +; AVX2-NEXT: movl %edx, %ecx +; AVX2-NEXT: andb $32, %cl +; AVX2-NEXT: movq %r8, %rax +; AVX2-NEXT: shrdq %cl, %rsi, %rax +; AVX2-NEXT: shrxq %rcx, %rsi, %rcx +; AVX2-NEXT: testb $64, %dl +; AVX2-NEXT: cmoveq %rax, %rcx +; AVX2-NEXT: btl %edx, %ecx +; AVX2-NEXT: setae %al +; AVX2-NEXT: orq %r11, %rsi +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) ; AVX2-NEXT: retq ; -; AVX512-LABEL: reset_multiload_i128: +; AVX512-LABEL: sequence_i128: ; AVX512: # %bb.0: +; AVX512-NEXT: movl %ecx, %eax ; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %r8d +; AVX512-NEXT: movl $1, %r9d ; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %r8, %rsi -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: shlxq %rcx, %r8, %r8 +; AVX512-NEXT: shldq %cl, %r9, %rsi +; AVX512-NEXT: xorl %r10d, %r10d +; AVX512-NEXT: shlxq %rcx, %r9, %r8 ; AVX512-NEXT: testb $64, %cl ; AVX512-NEXT: cmovneq %r8, %rsi -; AVX512-NEXT: cmovneq %rax, %r8 -; AVX512-NEXT: notq %rsi -; AVX512-NEXT: notq %r8 -; AVX512-NEXT: movl %ecx, %r9d -; AVX512-NEXT: andl $96, %r9d -; AVX512-NEXT: shrl $3, %r9d -; AVX512-NEXT: movl (%rdi,%r9), %r9d -; AVX512-NEXT: btl %ecx, %r9d -; AVX512-NEXT: jb .LBB22_2 -; AVX512-NEXT: # %bb.1: -; AVX512-NEXT: movl (%rdx), %eax -; AVX512-NEXT: .LBB22_2: -; AVX512-NEXT: andq %r8, (%rdi) -; AVX512-NEXT: andq %rsi, 8(%rdi) -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: cmovneq %r10, %r8 +; AVX512-NEXT: xorl %r11d, %r11d +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: shldq %cl, %r9, %r11 +; AVX512-NEXT: shlxq %rax, %r9, %r9 +; AVX512-NEXT: testb $64, %al +; AVX512-NEXT: cmovneq %r9, %r11 +; AVX512-NEXT: cmovneq %r10, %r9 +; AVX512-NEXT: xorq 8(%rdi), %rsi +; AVX512-NEXT: xorq (%rdi), %r8 +; 
AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: andb $32, %cl +; AVX512-NEXT: movq %r8, %rax +; AVX512-NEXT: shrdq %cl, %rsi, %rax +; AVX512-NEXT: shrxq %rcx, %rsi, %rcx +; AVX512-NEXT: testb $64, %dl +; AVX512-NEXT: cmoveq %rax, %rcx +; AVX512-NEXT: btl %edx, %ecx +; AVX512-NEXT: setae %al +; AVX512-NEXT: orq %r11, %rsi +; AVX512-NEXT: orq %r9, %r8 +; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq %rsi, 8(%rdi) ; AVX512-NEXT: retq - %rem = and i32 %position, 127 - %ofs = zext nneg i32 %rem to i128 - %bit = shl nuw i128 1, %ofs - %mask = xor i128 %bit, -1 + %rem0 = and i32 %pos0, 127 + %rem1 = and i32 %pos1, 127 + %rem2 = and i32 %pos2, 127 + %ofs0 = zext nneg i32 %rem0 to i128 + %ofs1 = zext nneg i32 %rem1 to i128 + %ofs2 = zext nneg i32 %rem2 to i128 + %bit0 = shl nuw i128 1, %ofs0 + %bit1 = shl nuw i128 1, %ofs1 + %bit2 = shl nuw i128 1, %ofs2 %ld = load i128, ptr %word - %sel = load i32, ptr %p - %test = and i128 %ld, %bit - %res = and i128 %ld, %mask - %cmp = icmp eq i128 %test, 0 - store i128 %res, ptr %word - %ret = select i1 %cmp, i32 %sel, i32 0 - ret i32 %ret + %res0 = xor i128 %ld, %bit0 + %test1 = and i128 %res0, %bit1 + %cmp1 = icmp eq i128 %test1, 0 + %res2 = or i128 %res0, %bit2 + store i128 %res2, ptr %word + ret i1 %cmp1 } diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll index f36baba..ab8498d 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll @@ -14,7 +14,6 @@ entry: } ; CHECK: _ZL10myCallbacki: -; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: define internal void @_ZL10myCallbacki(i32 %value) !type !2 { entry: %sink = alloca i32, align 4 @@ -33,6 +32,6 @@ entry: ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. 
; CHECK-NEXT: .byte 1 ;; Function Entry PC -; CHECK-NEXT: .quad [[LABEL_FUNC]] +; CHECK-NEXT: .quad _ZL10myCallbacki ;; Function type ID ; CHECK-NEXT: .quad -5212364466660467813 diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll index cdbad66..02d7107 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll @@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8) declare !type !2 ptr @direct_baz(ptr) ; CHECK: ball: -; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: define ptr @ball() { entry: call void @direct_foo() @@ -42,7 +41,7 @@ entry: ;; Flags ; CHECK-NEXT: .byte 7 ;; Function Entry PC -; CHECK-NEXT: .quad [[LABEL_FUNC]] +; CHECK-NEXT: .quad ball ;; Function type ID -- set to 0 as no type metadata attached to function. ; CHECK-NEXT: .quad 0 ;; Number of unique direct callees. diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll index feac3dc..30f1874 100644 --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -1684,15 +1684,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_shl_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; 
GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -1876,15 +1875,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_lshr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -2232,36 +2231,16 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_ashr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, 
%zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; GFNIAVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/narrow-add-i64.ll b/llvm/test/CodeGen/X86/narrow-add-i64.ll new file mode 100644 index 0000000..a7a54fd --- /dev/null +++ b/llvm/test/CodeGen/X86/narrow-add-i64.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 + +define i64 @test_add_i64_i16_const(i16 %a) nounwind { +; X86-LABEL: test_add_i64_i16_const: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $42, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl +; +; X64-LABEL: test_add_i64_i16_const: +; X64: # %bb.0: +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: addq $42, %rax +; X64-NEXT: retq + %zext_a = zext i16 %a to i64 + %sum = add nuw nsw i64 %zext_a, 42 + ret i64 %sum +} + +; TODO: First 48 bits are all zeros so we can safely truncate to 32 bit addition +define i64 @test_add_i64_i16_zext(i16 %a, i16 %b) nounwind { +; X86-LABEL: test_add_i64_i16_zext: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl +; +; 
X64-LABEL: test_add_i64_i16_zext: +; X64: # %bb.0: +; X64-NEXT: movzwl %di, %ecx +; X64-NEXT: movzwl %si, %eax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: retq + %zext_a = zext i16 %a to i64 + %zext_b = zext i16 %b to i64 + %sum = add nuw nsw i64 %zext_a, %zext_b + ret i64 %sum +} + +; Negative: Set the 32nd bit of a to force 64 bit addition, we do not truncate to 32 bit addition in this case +define i64 @negative_test_add_i64_i16(i16 %a) nounwind { +; X86-LABEL: negative_test_add_i64_i16: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $42, %eax +; X86-NEXT: movl $1, %edx +; X86-NEXT: retl +; +; X64-LABEL: negative_test_add_i64_i16: +; X64: # %bb.0: +; X64-NEXT: movzwl %di, %ecx +; X64-NEXT: movabsq $4294967338, %rax # imm = 0x10000002A +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: retq + %zext_a = zext i16 %a to i64 + %or_a = or i64 %zext_a, 4294967296 + %sum = add nuw nsw i64 %or_a, 42 + ret i64 %sum +} + +; Negative: We don't truncate to 32 bit addition in case of sign extension +define i64 @negative_test_add_i64_i16_sext(i16 %a, i16 %b) nounwind { +; X86-LABEL: negative_test_add_i64_i16_sext: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: negative_test_add_i64_i16_sext: +; X64: # %bb.0: +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movswq %di, %rcx +; X64-NEXT: movswq %si, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: retq + %sext_a = sext i16 %a to i64 + %sext_b = sext i16 %b to i64 + %sum = add nuw nsw i64 %sext_a, %sext_b + ret i64 %sum +} diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll new file mode 100644 index 
0000000..aef44cc --- /dev/null +++ b/llvm/test/CodeGen/X86/pr166534.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE4 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512 + +define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) { +; SSE2-LABEL: pr166534: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq (%rdi), %rax +; SSE2-NEXT: movq 8(%rdi), %r8 +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: movq (%rsi), %r9 +; SSE2-NEXT: movq 8(%rsi), %rdi +; SSE2-NEXT: movdqu (%rsi), %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %esi +; SSE2-NEXT: xorl %r10d, %r10d +; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF +; SSE2-NEXT: sete %r10b +; SSE2-NEXT: orq %r10, (%rdx) +; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF +; SSE2-NEXT: jne .LBB0_2 +; SSE2-NEXT: # %bb.1: # %if.then +; SSE2-NEXT: xorq %r9, %rax +; SSE2-NEXT: xorq %rdi, %r8 +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: orq %rax, %r8 +; SSE2-NEXT: sete %dl +; SSE2-NEXT: orq %rdx, (%rcx) +; SSE2-NEXT: .LBB0_2: # %if.end +; SSE2-NEXT: retq +; +; SSE4-LABEL: pr166534: +; SSE4: # %bb.0: # %entry +; SSE4-NEXT: movq (%rdi), %rax +; SSE4-NEXT: movq 8(%rdi), %r8 +; SSE4-NEXT: movdqu (%rdi), %xmm0 +; SSE4-NEXT: movq (%rsi), %r9 +; SSE4-NEXT: movq 8(%rsi), %rdi +; SSE4-NEXT: movdqu (%rsi), %xmm1 +; SSE4-NEXT: pxor %xmm0, %xmm1 +; SSE4-NEXT: xorl %esi, %esi +; SSE4-NEXT: ptest %xmm1, %xmm1 +; SSE4-NEXT: sete %sil +; SSE4-NEXT: orq %rsi, (%rdx) +; SSE4-NEXT: ptest %xmm1, %xmm1 +; SSE4-NEXT: jne .LBB0_2 +; SSE4-NEXT: # %bb.1: # %if.then +; SSE4-NEXT: xorq %r9, %rax +; SSE4-NEXT: xorq %rdi, %r8 +; SSE4-NEXT: xorl %edx, %edx +; SSE4-NEXT: orq %rax, %r8 +; SSE4-NEXT: sete 
%dl +; SSE4-NEXT: orq %rdx, (%rcx) +; SSE4-NEXT: .LBB0_2: # %if.end +; SSE4-NEXT: retq +; +; AVX2-LABEL: pr166534: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: movq 8(%rdi), %r8 +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: movq (%rsi), %rdi +; AVX2-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: movq 8(%rsi), %rsi +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: vptest %xmm0, %xmm0 +; AVX2-NEXT: sete %r9b +; AVX2-NEXT: orq %r9, (%rdx) +; AVX2-NEXT: vptest %xmm0, %xmm0 +; AVX2-NEXT: jne .LBB0_2 +; AVX2-NEXT: # %bb.1: # %if.then +; AVX2-NEXT: xorq %rdi, %rax +; AVX2-NEXT: xorq %rsi, %r8 +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: orq %rax, %r8 +; AVX2-NEXT: sete %dl +; AVX2-NEXT: orq %rdx, (%rcx) +; AVX2-NEXT: .LBB0_2: # %if.end +; AVX2-NEXT: retq +; +; AVX512-LABEL: pr166534: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: movq 8(%rdi), %r8 +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: movq (%rsi), %r9 +; AVX512-NEXT: movq 8(%rsi), %rdi +; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: vptest %xmm0, %xmm0 +; AVX512-NEXT: sete %sil +; AVX512-NEXT: orq %rsi, (%rdx) +; AVX512-NEXT: vptest %xmm0, %xmm0 +; AVX512-NEXT: jne .LBB0_2 +; AVX512-NEXT: # %bb.1: # %if.then +; AVX512-NEXT: xorq %r9, %rax +; AVX512-NEXT: xorq %rdi, %r8 +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: orq %rax, %r8 +; AVX512-NEXT: sete %dl +; AVX512-NEXT: orq %rdx, (%rcx) +; AVX512-NEXT: .LBB0_2: # %if.end +; AVX512-NEXT: retq +entry: + %a = load i128, ptr %pa, align 8 + %b = load i128, ptr %pb, align 8 + %cmp = icmp eq i128 %a, %b + %conv1 = zext i1 %cmp to i128 + %c = load i128, ptr %pc, align 8 + %or = or i128 %c, %conv1 + store i128 %or, ptr %pc, align 8 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %d = load i128, ptr %pd, align 8 + %or7 = or i128 %d, %conv1 + store i128 %or7, ptr %pd, align 8 + br label %if.end + +if.end: + ret void +} diff --git 
a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll new file mode 100644 index 0000000..21b25d8 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr166744.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=POSTRA +; RUN: llc < %s -mtriple=x86_64-- -mcpu=haswell | FileCheck %s --check-prefixes=NOPOSTRA +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=NOPOSTRA + +; Ensure reloads are after narrowed i512 -> i32 store +define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) { +; POSTRA-LABEL: PR166744: +; POSTRA: # %bb.0: +; POSTRA-NEXT: movl $1029, %eax # imm = 0x405 +; POSTRA-NEXT: shlxl %esi, %edx, %edx +; POSTRA-NEXT: bextrl %eax, %esi, %eax +; POSTRA-NEXT: movl (%rdi,%rax,4), %ecx +; POSTRA-NEXT: btrl %esi, %ecx +; POSTRA-NEXT: orl %ecx, %edx +; POSTRA-NEXT: movl %edx, (%rdi,%rax,4) +; POSTRA-NEXT: movq 16(%rdi), %rax +; POSTRA-NEXT: movq (%rdi), %rcx +; POSTRA-NEXT: movq 24(%rdi), %rdx +; POSTRA-NEXT: movq 8(%rdi), %rsi +; POSTRA-NEXT: orq 56(%rdi), %rdx +; POSTRA-NEXT: orq 40(%rdi), %rsi +; POSTRA-NEXT: orq 48(%rdi), %rax +; POSTRA-NEXT: orq 32(%rdi), %rcx +; POSTRA-NEXT: orq %rdx, %rsi +; POSTRA-NEXT: orq %rax, %rcx +; POSTRA-NEXT: orq %rsi, %rcx +; POSTRA-NEXT: setne %al +; POSTRA-NEXT: retq +; +; NOPOSTRA-LABEL: PR166744: +; NOPOSTRA: # %bb.0: +; NOPOSTRA-NEXT: movl %esi, %eax +; NOPOSTRA-NEXT: shrl $3, %eax +; NOPOSTRA-NEXT: andl $60, %eax +; NOPOSTRA-NEXT: movl (%rdi,%rax), %ecx +; NOPOSTRA-NEXT: btrl %esi, %ecx +; NOPOSTRA-NEXT: shlxl %esi, %edx, %edx +; NOPOSTRA-NEXT: orl %ecx, %edx +; NOPOSTRA-NEXT: movl %edx, (%rdi,%rax) +; NOPOSTRA-NEXT: movq 16(%rdi), %rax +; NOPOSTRA-NEXT: movq (%rdi), %rcx +; NOPOSTRA-NEXT: movq 8(%rdi), %rdx +; NOPOSTRA-NEXT: movq 24(%rdi), %rsi +; NOPOSTRA-NEXT: orq 56(%rdi), %rsi +; NOPOSTRA-NEXT: orq 40(%rdi), %rdx +; NOPOSTRA-NEXT: orq 48(%rdi), %rax +; 
NOPOSTRA-NEXT: orq 32(%rdi), %rcx +; NOPOSTRA-NEXT: orq %rsi, %rdx +; NOPOSTRA-NEXT: orq %rax, %rcx +; NOPOSTRA-NEXT: orq %rdx, %rcx +; NOPOSTRA-NEXT: setne %al +; NOPOSTRA-NEXT: retq + %rem = and i64 %idx, 511 + %sh_prom = zext nneg i64 %rem to i512 + %shl = shl nuw i512 1, %sh_prom + %not = xor i512 %shl, -1 + %load = load i512, ptr %v, align 8 + %and = and i512 %load, %not + %conv2 = zext i1 %b to i512 + %shl4 = shl nuw i512 %conv2, %sh_prom + %or = or i512 %and, %shl4 + store i512 %or, ptr %v, align 8 + %cmp = icmp ne i512 %or, 0 + ret i1 %cmp +} diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll index 0fb0420..aff2228 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -106,36 +106,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3 -; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3 -; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4 -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: 
vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; AVX512BW-NEXT: kmovq %rax, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll index 103d570..4450d07 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -85,20 +85,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2 
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, %b diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll index efd7429..41238ac 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll @@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} ; AVX512BW-NEXT: retq %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/issue163738.ll b/llvm/test/CodeGen/X86/vpternlog.ll index 61fe043..bd7478d 100644 --- a/llvm/test/CodeGen/X86/issue163738.ll +++ b/llvm/test/CodeGen/X86/vpternlog.ll @@ -11,3 +11,15 @@ define <8 x i64> @foo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) { %and3 = xor <8 x i64> %and3.demorgan, splat (i64 -1) ret <8 x i64> %and3 } + +define <8 x i64> @xorbitcast(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c) { +; CHECK-LABEL: xorbitcast: +; CHECK: # %bb.0: +; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = ~(zmm0 | zmm2 | zmm1) +; CHECK-NEXT: retq + %or1 = or <64 x i8> %a, %b + %or2 = or <64 x i8> %or1, %c + %cast = bitcast <64 x i8> %or2 to <8 x i64> + %xor = xor <8 x i64> %cast, splat (i64 -1) + ret <8 x i64> %xor +} diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 84c2cc6..7735500 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -168,8 +168,8 @@ define void @load_2byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; 
X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movb %al, (%rdx) @@ -188,17 +188,15 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -215,13 +213,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, 
%esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl @@ -236,14 +232,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi @@ -260,23 +253,19 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx 
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %dl, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 @@ -292,8 +281,8 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movw %ax, (%rdx) @@ -312,17 +301,15 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; 
X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -339,18 +326,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl ; @@ -360,14 +345,11 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi @@ -386,18 +368,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: 
cmovel %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 @@ -413,8 +393,8 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca: ; X64-NO-BMI2: # %bb.0: -; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movl %eax, (%rdx) @@ -433,17 +413,15 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -460,18 +438,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 
%byteOff, ptr %dst) ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl ; @@ -481,14 +457,11 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
4(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi @@ -507,18 +480,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 @@ -536,8 +507,8 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; 
X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -557,8 +528,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -571,8 +542,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -591,8 +562,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -698,8 +669,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_2byte_chunk_of_16byte_alloca(ptr 
%src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -719,8 +690,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -733,8 +704,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -753,8 +724,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -859,8 +830,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -880,8 +851,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -894,8 +865,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -914,8 +885,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -1020,8 +991,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi @@ -1041,8 +1012,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi @@ -1055,8 +1026,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx @@ -1075,8 +1046,8 @@ 
define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi |
