; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mcpu=knl | FileCheck %s --check-prefixes=AVX512F

; Tests for BMI1/BMI2/TBM-style bit manipulations that could potentially stay on the mask (predicate) registers.

; ANDNOT - Logical and not
; e.g. mask = 0b11001010, a2 = 0b01001000 -> mask & ~a2 = 0b10000010
define <8 x i64> @andnot_v8i64(<8 x i64> %a0, <8 x i64> %a1, i8 %a2) {
; AVX512-LABEL: andnot_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    notb %dil
; AVX512-NEXT:    kmovd %edi, %k1
; AVX512-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1 {%k1}
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: andnot_v8i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    notb %dil
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1 {%k1}
; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT:    retq
  %cmp = icmp sgt <8 x i64> %a0, %a1
  %mask = bitcast <8 x i1> %cmp to i8
  %not = xor i8 %a2, -1
  %andnot = and i8 %mask, %not
  %sel = bitcast i8 %andnot to <8 x i1>
  %add = add <8 x i64> %a0, %a1
  %res = select <8 x i1> %sel, <8 x i64> %a0, <8 x i64> %add
  ret <8 x i64> %res
}

; BEXTR - Bit field extract (register)
; e.g. mask = 0b11011000, idx = 3, len = 4 -> (mask >> 3) & 0b1111 = 0b1011
define <32 x i16> @bextr_reg_v32i16(<32 x i16> %a0, <32 x i16> %a1, i32 %idx, i32 %len) {
; AVX512-LABEL: bextr_reg_v32i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    shrxl %edi, %eax, %eax
; AVX512-NEXT:    bzhil %esi, %eax, %eax
; AVX512-NEXT:    kmovd %eax, %k1
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: bextr_reg_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k0
; AVX512F-NEXT:    kmovw %k0, %ecx
; AVX512F-NEXT:    shll $16, %ecx
; AVX512F-NEXT:    orl %eax, %ecx
; AVX512F-NEXT:    shrxl %edi, %ecx, %eax
; AVX512F-NEXT:    bzhil %esi, %eax, %eax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    shrl $16, %eax
; AVX512F-NEXT:    kmovw %eax, %k2
; AVX512F-NEXT:    vpaddw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1
; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
; AVX512F-NEXT:    retq
  %cmp = icmp sgt <32 x i16> %a0, %a1
  %mask = bitcast <32 x i1> %cmp to i32
  %shift = lshr i32 %mask, %idx
  %bit = shl i32 1, %len
  %msk = sub i32 %bit, 1
  %bextr = and i32 %shift, %msk
  %sel = bitcast i32 %bextr to <32 x i1>
  %add = add <32 x i16> %a0, %a1
  %res = select <32 x i1> %sel, <32 x i16> %a0, <32 x i16> %add
  ret <32 x i16> %res
}

; BEXTR - Bit field extract (immediate)
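; Worked example on a scalar mask: for mask = 0b10110100,
; (mask >> 2) & 7 = 0b101, i.e. bits 2..4 of the compare mask
; select lanes 0..2 of the result.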
define <32 x i16> @bextr_imm_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512-LABEL: bextr_imm_v32i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    shrl $2, %eax
; AVX512-NEXT:    andl $7, %eax
; AVX512-NEXT:    kmovd %eax, %k1
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: bextr_imm_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpaddw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    shrl $2, %eax
; AVX512F-NEXT:    andl $7, %eax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
; AVX512F-NEXT:    retq
  %cmp = icmp sgt <32 x i16> %a0, %a1
  %mask = bitcast <32 x i1> %cmp to i32
  %shift = lshr i32 %mask, 2
  %bextr = and i32 %shift, 7
  %sel = bitcast i32 %bextr to <32 x i1>
  %add = add <32 x i16> %a0, %a1
  %res = select <32 x i1> %sel, <32 x i16> %a0, <32 x i16> %add
  ret <32 x i16> %res
}

; BLSI - Extract lowest set isolated bit (x & -x)
; e.g. x = 0b01101000 -> x & -x = 0b00001000 (lowest set bit isolated)
define <64 x i8> @blsi_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512-LABEL: blsi_v64i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
; AVX512-NEXT:    kmovq %k0, %rax
; AVX512-NEXT:    blsiq %rax, %rax
; AVX512-NEXT:    kmovq %rax, %k1
; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: blsi_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT:    vpmovmskb %ymm5, %eax
; AVX512F-NEXT:    vpmovmskb %ymm4, %ecx
; AVX512F-NEXT:    shlq $32, %rcx
; AVX512F-NEXT:    orq %rax, %rcx
; AVX512F-NEXT:    blsiq %rcx, %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    movl %eax, %edx
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    shrq $32, %rax
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    shrl $16, %edx
; AVX512F-NEXT:    kmovw %edx, %k2
; AVX512F-NEXT:    kmovw %ecx, %k3
; AVX512F-NEXT:    kmovw %eax, %k4
; AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k4} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 {%k2} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
; AVX512F-NEXT:    retq
  %cmp = icmp sgt <64 x i8> %a0, %a1
  %mask = bitcast <64 x i1> %cmp to i64
  %neg = sub i64 0, %mask
  %blsi = and i64 %mask, %neg
  %sel = bitcast i64 %blsi to <64 x i1>
  %add = add <64 x i8> %a0, %a1
  %res = select <64 x i1> %sel, <64 x i8> %a0, <64 x i8> %add
  ret <64 x i8> %res
}

; BLSMSK - Get mask up to lowest set bit (x ^ (x - 1))
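; Worked example on a scalar mask: for mask = 0b01101000, mask - 1 = 0b01100111,
; so mask ^ (mask - 1) = 0b00001111 (everything up to and including the lowest set bit).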
define <16 x float> @blsmsk_v16f32(<16 x float> %a0, <16 x float> %a1) {
; AVX512-LABEL: blsmsk_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpltps %zmm0, %zmm1, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    blsmskl %eax, %eax
; AVX512-NEXT:    kmovd %eax, %k1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovaps %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: blsmsk_v16f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vcmpltps %zmm0, %zmm1, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    blsmskl %eax, %eax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vaddps %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
; AVX512F-NEXT:    retq
  %cmp = fcmp ogt <16 x float> %a0, %a1
  %mask = bitcast <16 x i1> %cmp to i16
  %dec = sub i16 %mask, 1
  %blsmsk = xor i16 %mask, %dec
  %sel = bitcast i16 %blsmsk to <16 x i1>
  %add = fadd <16 x float> %a0, %a1
  %res = select <16 x i1> %sel, <16 x float> %a0, <16 x float> %add
  ret <16 x float> %res
}

; BLSR - Reset lowest set bit (x & (x - 1))
; e.g. x = 0b01101000 -> x & (x - 1) = 0b01100000 (lowest set bit cleared)
define <8 x double> @blsr_v8f64(<8 x double> %a0, <8 x double> %a1) {
; AVX512-LABEL: blsr_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    leal -1(%rax), %ecx
; AVX512-NEXT:    andb %al, %cl
; AVX512-NEXT:    kmovd %ecx, %k1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovapd %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: blsr_v8f64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    leal -1(%rax), %ecx
; AVX512F-NEXT:    andb %al, %cl
; AVX512F-NEXT:    kmovw %ecx, %k1
; AVX512F-NEXT:    vaddpd %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; AVX512F-NEXT:    vmovapd %zmm1, %zmm0
; AVX512F-NEXT:    retq
  %cmp = fcmp ogt <8 x double> %a0, %a1
  %mask = bitcast <8 x i1> %cmp to i8
  %dec = sub i8 %mask, 1
  %blsr = and i8 %mask, %dec
  %sel = bitcast i8 %blsr to <8 x i1>
  %add = fadd <8 x double> %a0, %a1
  %res = select <8 x i1> %sel, <8 x double> %a0, <8 x double> %add
  ret <8 x double> %res
}

; BZHI - Zero high bits starting from specified index (x & ((1 << idx) - 1))
; e.g. x = 0b11011011, idx = 5 -> x & 0b00011111 = 0b00011011
define <16 x i32> @bzhi_v16i32(<16 x i32> %a0, <16 x i32> %a1, i16 %idx) {
; AVX512-LABEL: bzhi_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movl $-1, %eax
; AVX512-NEXT:    bzhil %edi, %eax, %eax
; AVX512-NEXT:    kmovd %eax, %k1
; AVX512-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1 {%k1}
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: bzhi_v16i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movl $-1, %eax
; AVX512F-NEXT:    bzhil %edi, %eax, %eax
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1 {%k1}
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT:    retq
  %cmp = icmp ugt <16 x i32> %a0, %a1
  %mask = bitcast <16 x i1> %cmp to i16
  %bit = shl i16 1, %idx
  %msk = sub i16 %bit, 1
  %bzhi = and i16 %mask, %msk
  %sel = bitcast i16 %bzhi to <16 x i1>
  %add = add <16 x i32> %a0, %a1
  %res = select <16 x i1> %sel, <16 x i32> %a0, <16 x i32> %add
  ret <16 x i32> %res
}

; BLCFILL - Fill from lowest clear bit (x & (x + 1))
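; Worked example on a scalar mask: for mask = 0b01100111, mask + 1 = 0b01101000,
; so mask & (mask + 1) = 0b01100000 (the trailing ones are cleared).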
define <32 x i16> @blcfill_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512-LABEL: blcfill_v32i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    leal 1(%rax), %ecx
; AVX512-NEXT:    andl %eax, %ecx
; AVX512-NEXT:    kmovd %ecx, %k1
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: blcfill_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k0
; AVX512F-NEXT:    kmovw %k0, %ecx
; AVX512F-NEXT:    shll $16, %ecx
; AVX512F-NEXT:    leal (%rax,%rcx), %edx
; AVX512F-NEXT:    addl $1, %edx
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    orl %ecx, %eax
; AVX512F-NEXT:    andl %eax, %edx
; AVX512F-NEXT:    kmovw %edx, %k1
; AVX512F-NEXT:    shrl $16, %edx
; AVX512F-NEXT:    kmovw %edx, %k2
; AVX512F-NEXT:    vpaddw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1
; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
; AVX512F-NEXT:    retq
  %cmp = icmp sgt <32 x i16> %a0, %a1
  %mask = bitcast <32 x i1> %cmp to i32
  %inc = add i32 %mask, 1
  %blcfill = and i32 %mask, %inc
  %sel = bitcast i32 %blcfill to <32 x i1>
  %add = add <32 x i16> %a0, %a1
  %res = select <32 x i1> %sel, <32 x i16> %a0, <32 x i16> %add
  ret <32 x i16> %res
}

; BLCI - Isolate lowest clear bit (x | ~(x + 1))
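; Worked example on a scalar mask: for mask = 0b01100111, ~(mask + 1) = 0b10010111,
; so mask | ~(mask + 1) = 0b11110111 (every bit set except the lowest clear bit).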
define <64 x i8> @blci_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512-LABEL: blci_v64i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
; AVX512-NEXT:    kmovq %k0, %rax
; AVX512-NEXT:    leaq 1(%rax), %rcx
; AVX512-NEXT:    notq %rcx
; AVX512-NEXT:    orq %rax, %rcx
; AVX512-NEXT:    kmovq %rcx, %k1
; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: blci_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT:    vpmovmskb %ymm5, %eax
; AVX512F-NEXT:    vpmovmskb %ymm4, %ecx
; AVX512F-NEXT:    shlq $32, %rcx
; AVX512F-NEXT:    leaq (%rax,%rcx), %rdx
; AVX512F-NEXT:    addq %rcx, %rax
; AVX512F-NEXT:    addq $1, %rax
; AVX512F-NEXT:    notq %rax
; AVX512F-NEXT:    orq %rdx, %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    movl %eax, %edx
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    shrq $32, %rax
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    shrl $16, %edx
; AVX512F-NEXT:    kmovw %edx, %k2
; AVX512F-NEXT:    kmovw %ecx, %k3
; AVX512F-NEXT:    kmovw %eax, %k4
; AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k4} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 {%k2} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
; AVX512F-NEXT:    retq
  %cmp = icmp sgt <64 x i8> %a0, %a1
  %mask = bitcast <64 x i1> %cmp to i64
  %inc = add i64 %mask, 1
  %not = xor i64 %inc, -1
  %blci = or i64 %mask, %not
  %sel = bitcast i64 %blci to <64 x i1>
  %add = add <64 x i8> %a0, %a1
  %res = select <64 x i1> %sel, <64 x i8> %a0, <64 x i8> %add
  ret <64 x i8> %res
}

; BLCIC - Isolate lowest clear bit and complement (~x & (x + 1))
; e.g. x = 0b01100111 -> ~x & (x + 1) = 0b00001000 (lowest clear bit isolated)
define <8 x i64> @blcic_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX512-LABEL: blcic_v8i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    movl %eax, %ecx
; AVX512-NEXT:    notb %cl
; AVX512-NEXT:    incb %al
; AVX512-NEXT:    andb %cl, %al
; AVX512-NEXT:    kmovd %eax, %k1
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: blcic_v8i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    movl %eax, %ecx
; AVX512F-NEXT:    notb %cl
; AVX512F-NEXT:    addb $1, %al
; AVX512F-NEXT:    andb %cl, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT:    retq
  %cmp = icmp uge <8 x i64> %a0, %a1
  %mask = bitcast <8 x i1> %cmp to i8
  %not = xor i8 %mask, -1
  %inc = add i8 %mask, 1
  %blcic = and i8 %not, %inc
  %sel = bitcast i8 %blcic to <8 x i1>
  %add = add <8 x i64> %a0, %a1
  %res = select <8 x i1> %sel, <8 x i64> %a0, <8 x i64> %add
  ret <8 x i64> %res
}

; BLCMSK - Mask from lowest clear bit (x ^ (x + 1))
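; Worked example on a scalar mask: for mask = 0b01100111, mask + 1 = 0b01101000,
; so mask ^ (mask + 1) = 0b00001111 (the trailing ones plus the lowest clear bit).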
define <32 x i16> @blcmsk_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512-LABEL: blcmsk_v32i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    leal 1(%rax), %ecx
; AVX512-NEXT:    xorl %eax, %ecx
; AVX512-NEXT:    kmovd %ecx, %k1
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: blcmsk_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k0
; AVX512F-NEXT:    kmovw %k0, %ecx
; AVX512F-NEXT:    shll $16, %ecx
; AVX512F-NEXT:    leal (%rax,%rcx), %edx
; AVX512F-NEXT:    addl $1, %edx
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    orl %ecx, %eax
; AVX512F-NEXT:    xorl %eax, %edx
; AVX512F-NEXT:    kmovw %edx, %k1
; AVX512F-NEXT:    shrl $16, %edx
; AVX512F-NEXT:    kmovw %edx, %k2
; AVX512F-NEXT:    vpaddw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1
; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
; AVX512F-NEXT:    retq
  %cmp = icmp sgt <32 x i16> %a0, %a1
  %mask = bitcast <32 x i1> %cmp to i32
  %inc = add i32 %mask, 1
  %blcmsk = xor i32 %mask, %inc
  %sel = bitcast i32 %blcmsk to <32 x i1>
  %add = add <32 x i16> %a0, %a1
  %res = select <32 x i1> %sel, <32 x i16> %a0, <32 x i16> %add
  ret <32 x i16> %res
}

; BLCS - Set lowest clear bit (x | (x + 1))
; e.g. x = 0b01100111 -> x | (x + 1) = 0b01101111 (lowest clear bit set)
define <16 x float> @blcs_v16f32(<16 x float> %a0, <16 x float> %a1) {
; AVX512-LABEL: blcs_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpltps %zmm0, %zmm1, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    leal 1(%rax), %ecx
; AVX512-NEXT:    orl %eax, %ecx
; AVX512-NEXT:    kmovd %ecx, %k1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovaps %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: blcs_v16f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vcmpltps %zmm0, %zmm1, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    leal 1(%rax), %ecx
; AVX512F-NEXT:    orl %eax, %ecx
; AVX512F-NEXT:    kmovw %ecx, %k1
; AVX512F-NEXT:    vaddps %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
; AVX512F-NEXT:    retq
  %cmp = fcmp ogt <16 x float> %a0, %a1
  %mask = bitcast <16 x i1> %cmp to i16
  %inc = add i16 %mask, 1
  %blcs = or i16 %mask, %inc
  %sel = bitcast i16 %blcs to <16 x i1>
  %add = fadd <16 x float> %a0, %a1
  %res = select <16 x i1> %sel, <16 x float> %a0, <16 x float> %add
  ret <16 x float> %res
}

; BLSFILL - Fill from lowest set bit (x | (x - 1))
; e.g. x = 0b01101000 -> x | (x - 1) = 0b01101111 (filled below the lowest set bit)
define <16 x i32> @blsfill_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
; AVX512-LABEL: blsfill_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpled %zmm1, %zmm0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    leal -1(%rax), %ecx
; AVX512-NEXT:    orl %eax, %ecx
; AVX512-NEXT:    kmovd %ecx, %k1
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: blsfill_v16i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpled %zmm1, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    leal -1(%rax), %ecx
; AVX512F-NEXT:    orl %eax, %ecx
; AVX512F-NEXT:    kmovw %ecx, %k1
; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT:    retq
  %cmp = icmp sle <16 x i32> %a0, %a1
  %mask = bitcast <16 x i1> %cmp to i16
  %dec = sub i16 %mask, 1
  %blsfill = or i16 %mask, %dec
  %sel = bitcast i16 %blsfill to <16 x i1>
  %add = add <16 x i32> %a0, %a1
  %res = select <16 x i1> %sel, <16 x i32> %a0, <16 x i32> %add
  ret <16 x i32> %res
}

; BLSIC - Isolate lowest set bit and complement (~x | (x - 1))
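; Worked example on a scalar mask: for mask = 0b01101000, ~mask = 0b10010111 and
; mask - 1 = 0b01100111, so ~mask | (mask - 1) = 0b11110111 (every bit set except
; the lowest set bit).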
define <8 x double> @blsic_v8f64(<8 x double> %a0, <8 x double> %a1) {
; AVX512-LABEL: blsic_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    movl %eax, %ecx
; AVX512-NEXT:    notb %cl
; AVX512-NEXT:    decb %al
; AVX512-NEXT:    orb %cl, %al
; AVX512-NEXT:    kmovd %eax, %k1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovapd %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: blsic_v8f64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    movl %eax, %ecx
; AVX512F-NEXT:    notb %cl
; AVX512F-NEXT:    addb $-1, %al
; AVX512F-NEXT:    orb %cl, %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vaddpd %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; AVX512F-NEXT:    vmovapd %zmm1, %zmm0
; AVX512F-NEXT:    retq
  %cmp = fcmp ogt <8 x double> %a0, %a1
  %mask = bitcast <8 x i1> %cmp to i8
  %not = xor i8 %mask, -1
  %dec = sub i8 %mask, 1
  %blsic = or i8 %not, %dec
  %sel = bitcast i8 %blsic to <8 x i1>
  %add = fadd <8 x double> %a0, %a1
  %res = select <8 x i1> %sel, <8 x double> %a0, <8 x double> %add
  ret <8 x double> %res
}

; T1MSKC - Inverse mask from trailing ones (~x | (x + 1))
; e.g. x = 0b01100111 -> ~x | (x + 1) = 0b11111000 (trailing ones cleared, all other bits set)
define <32 x i16> @t1mskc_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512-LABEL: t1mskc_v32i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    leal 1(%rax), %ecx
; AVX512-NEXT:    orl %eax, %ecx
; AVX512-NEXT:    kmovd %ecx, %k1
; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: t1mskc_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k0
; AVX512F-NEXT:    kmovw %k0, %ecx
; AVX512F-NEXT:    shll $16, %ecx
; AVX512F-NEXT:    leal (%rax,%rcx), %edx
; AVX512F-NEXT:    addl $1, %edx
; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX512F-NEXT:    orl %ecx, %eax
; AVX512F-NEXT:    orl %eax, %edx
; AVX512F-NEXT:    kmovw %edx, %k1
; AVX512F-NEXT:    shrl $16, %edx
; AVX512F-NEXT:    kmovw %edx, %k2
; AVX512F-NEXT:    vpaddw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k2} {z} = -1
; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
; AVX512F-NEXT:    retq
  %cmp = icmp sgt <32 x i16> %a0, %a1
  %mask = bitcast <32 x i1> %cmp to i32
  %not = xor i32 %mask, -1
  %inc = add i32 %mask, 1
  %t1mskc = or i32 %not, %inc
  %sel = bitcast i32 %t1mskc to <32 x i1>
  %add = add <32 x i16> %a0, %a1
  %res = select <32 x i1> %sel, <32 x i16> %a0, <32 x i16> %add
  ret <32 x i16> %res
}

; TZMSK - Mask from trailing zeros (~x & (x - 1))
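; Worked example on a scalar mask: for mask = 0b01101000, ~mask = 0b10010111 and
; mask - 1 = 0b01100111, so ~mask & (mask - 1) = 0b00000111 (a mask of the trailing zeros).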
define <64 x i8> @tzmsk_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512-LABEL: tzmsk_v64i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
; AVX512-NEXT:    kmovq %k0, %rax
; AVX512-NEXT:    leaq -1(%rax), %rcx
; AVX512-NEXT:    andnq %rcx, %rax, %rax
; AVX512-NEXT:    kmovq %rax, %k1
; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512-NEXT:    retq
;
; AVX512F-LABEL: tzmsk_v64i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT:    vpmovmskb %ymm5, %eax
; AVX512F-NEXT:    vpmovmskb %ymm4, %ecx
; AVX512F-NEXT:    shlq $32, %rcx
; AVX512F-NEXT:    leaq (%rax,%rcx), %rdx
; AVX512F-NEXT:    addq %rcx, %rax
; AVX512F-NEXT:    addq $-1, %rax
; AVX512F-NEXT:    andnq %rax, %rdx, %rax
; AVX512F-NEXT:    movq %rax, %rcx
; AVX512F-NEXT:    movl %eax, %edx
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    shrq $32, %rax
; AVX512F-NEXT:    shrq $48, %rcx
; AVX512F-NEXT:    shrl $16, %edx
; AVX512F-NEXT:    kmovw %edx, %k2
; AVX512F-NEXT:    kmovw %ecx, %k3
; AVX512F-NEXT:    kmovw %eax, %k4
; AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 {%k4} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 {%k2} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
; AVX512F-NEXT:    retq
  %cmp = icmp sgt <64 x i8> %a0, %a1
  %mask = bitcast <64 x i1> %cmp to i64
  %not = xor i64 %mask, -1
  %dec = sub i64 %mask, 1
  %tzmsk = and i64 %not, %dec
  %sel = bitcast i64 %tzmsk to <64 x i1>
  %add = add <64 x i8> %a0, %a1
  %res = select <64 x i1> %sel, <64 x i8> %a0, <64 x i8> %add
  ret <64 x i8> %res
}