Diffstat (limited to 'llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll')
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll  393
1 file changed, 268 insertions(+), 125 deletions(-)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index 257c9a2..7bb3874 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -572,45 +572,45 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
-; AVX512-FAST-LABEL: store_i8_stride8_vf8:
-; AVX512-FAST: # %bb.0:
-; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
-; AVX512-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7]
-; AVX512-FAST-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3
-; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15>
-; AVX512-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2
-; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u>
-; AVX512-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2
-; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
-; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
-; AVX512-FAST-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1
-; AVX512-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX512-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0
-; AVX512-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0
-; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512-FAST-NEXT: vzeroupper
-; AVX512-FAST-NEXT: retq
+; AVX512F-FAST-LABEL: store_i8_stride8_vf8:
+; AVX512F-FAST: # %bb.0:
+; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7]
+; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15>
+; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u>
+; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
+; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1
+; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0
+; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512F-FAST-NEXT: vzeroupper
+; AVX512F-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: store_i8_stride8_vf8:
; AVX512BW-SLOW: # %bb.0:
@@ -629,32 +629,61 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u,22,30,u,u,u,u,u,u,23,31]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,22,30,u,u,u,u,u,u,23,31,u,u]
-; AVX512BW-SLOW-NEXT: movw $17544, %cx # imm = 0x4488
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
+; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512BW-SLOW-NEXT: movl $287445282, %ecx # imm = 0x11221122
; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1
-; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1}
-; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,u,u,22,30,u,u,u,u,u,u,23,31,u,u,u,u]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,22,30,u,u,u,u,u,u,23,31,u,u,u,u,u,u]
-; AVX512BW-SLOW-NEXT: movw $4386, %cx # imm = 0x1122
-; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2
-; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm6, %ymm4 {%k2}
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
-; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27]
-; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u]
-; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm1 {%k1}
-; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u]
-; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u]
-; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm0 {%k2}
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,0,1,2,3,0,1]
+; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3,0,1,2,3]
+; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512BW-SLOW-NEXT: movl $1149781128, %ecx # imm = 0x44884488
+; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1
+; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1}
+; AVX512BW-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA
+; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1
+; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: store_i8_stride8_vf8:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
+; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,10,12,14,8,10,12,14,9,11,13,15,9,11,13,15]
+; AVX512BW-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm1
+; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63]
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,0,2,4,6,1,3,5,7,1,3,5,7]
+; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,18,22,26,30,u,u,u,u,19,23,27,31,u,u,u,u,32,36,40,44,u,u,u,u,33,37,41,45,u,u,u,u,50,54,58,62,u,u,u,u,51,55,59,63,u,u,u,u]
+; AVX512BW-FAST-NEXT: movw $-21846, %cx # imm = 0xAAAA
+; AVX512BW-FAST-NEXT: kmovd %ecx, %k1
+; AVX512BW-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
%in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
%in.vec1 = load <8 x i8>, ptr %in.vecptr1, align 64
%in.vec2 = load <8 x i8>, ptr %in.vecptr2, align 64
@@ -1051,69 +1080,182 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
-; AVX512-LABEL: store_i8_stride8_vf16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512-NEXT: vmovdqa (%r8), %xmm2
-; AVX512-NEXT: vmovdqa (%r11), %xmm3
-; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
-; AVX512-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15>
-; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm6
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u>
-; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm9
-; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm11
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = <4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm14
-; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11>
-; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u>
-; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7
-; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm7, %ymm9, %ymm9
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = <0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %ymm15, %ymm12, %ymm12
-; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3]
-; AVX512-NEXT: vpshufb %ymm5, %ymm3, %ymm5
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
-; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm6
-; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
-; AVX512-NEXT: vpshufb %ymm10, %ymm1, %ymm6
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
-; AVX512-NEXT: vpshufb %ymm13, %ymm0, %ymm8
-; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
-; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3
-; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15]
-; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX512-NEXT: vpshufb %ymm15, %ymm0, %ymm0
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm4, (%rax)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: store_i8_stride8_vf16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512F-NEXT: vmovdqa (%r8), %xmm2
+; AVX512F-NEXT: vmovdqa (%r11), %xmm3
+; AVX512F-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512F-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
+; AVX512F-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
+; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2]
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15>
+; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm6
+; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2]
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u>
+; AVX512F-NEXT: vpshufb %ymm8, %ymm7, %ymm9
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u>
+; AVX512F-NEXT: vpshufb %ymm10, %ymm9, %ymm11
+; AVX512F-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %ymm13, %ymm12, %ymm14
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7]
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11>
+; AVX512F-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u>
+; AVX512F-NEXT: vpshufb %ymm14, %ymm7, %ymm7
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15]
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u>
+; AVX512F-NEXT: vpshufb %ymm7, %ymm9, %ymm9
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %ymm15, %ymm12, %ymm12
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3]
+; AVX512F-NEXT: vpshufb %ymm5, %ymm3, %ymm5
+; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
+; AVX512F-NEXT: vpshufb %ymm8, %ymm2, %ymm6
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
+; AVX512F-NEXT: vpshufb %ymm10, %ymm1, %ymm6
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
+; AVX512F-NEXT: vpshufb %ymm13, %ymm0, %ymm8
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
+; AVX512F-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512F-NEXT: vpshufb %ymm14, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15]
+; AVX512F-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm15, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax)
+; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-SLOW-LABEL: store_i8_stride8_vf16:
+; AVX512BW-SLOW: # %bb.0:
+; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-SLOW-NEXT: vmovdqa (%r11), %xmm3
+; AVX512BW-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512BW-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
+; AVX512BW-SLOW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm2[0,2,0,2,4,6,4,6]
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u>
+; AVX512BW-SLOW-NEXT: vpshufb %zmm5, %zmm4, %zmm4
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7]
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm3[0,2,0,2,4,6,4,6]
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15>
+; AVX512BW-SLOW-NEXT: vpshufb %zmm7, %zmm6, %zmm6
+; AVX512BW-SLOW-NEXT: movl $8913032, %ecx # imm = 0x880088
+; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1
+; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm6, %zmm4 {%k1}
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6]
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpshufb %zmm8, %zmm6, %zmm6
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm1[0,2,0,2,4,6,4,6]
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u>
+; AVX512BW-SLOW-NEXT: vpshufb %zmm10, %zmm9, %zmm9
+; AVX512BW-SLOW-NEXT: movl $2228258, %ecx # imm = 0x220022
+; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2
+; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm9, %zmm6 {%k2}
+; AVX512BW-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA
+; AVX512BW-SLOW-NEXT: kmovd %ecx, %k3
+; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm4, %zmm6 {%k3}
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
+; AVX512BW-SLOW-NEXT: vpshufb %zmm5, %zmm2, %zmm2
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[1,3,1,3,5,7,5,7]
+; AVX512BW-SLOW-NEXT: vpshufb %zmm7, %zmm3, %zmm3
+; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1}
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
+; AVX512BW-SLOW-NEXT: vpshufb %zmm8, %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7]
+; AVX512BW-SLOW-NEXT: vpshufb %zmm10, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
+; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, (%rax)
+; AVX512BW-SLOW-NEXT: vzeroupper
+; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: store_i8_stride8_vf16:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-FAST-NEXT: vmovdqa (%r11), %xmm3
+; AVX512BW-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512BW-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4
+; AVX512BW-FAST-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
+; AVX512BW-FAST-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
+; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14]
+; AVX512BW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15>
+; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm3, %zmm3
+; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm2[0,2,0,2,4,6,4,6]
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u>
+; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm8, %zmm8
+; AVX512BW-FAST-NEXT: movl $8913032, %ecx # imm = 0x880088
+; AVX512BW-FAST-NEXT: kmovd %ecx, %k1
+; AVX512BW-FAST-NEXT: vmovdqu16 %zmm3, %zmm8 {%k1}
+; AVX512BW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm1
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6]
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u>
+; AVX512BW-FAST-NEXT: vpshufb %zmm10, %zmm6, %zmm6
+; AVX512BW-FAST-NEXT: movl $2228258, %ecx # imm = 0x220022
+; AVX512BW-FAST-NEXT: kmovd %ecx, %k2
+; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm6 {%k2}
+; AVX512BW-FAST-NEXT: movw $-21846, %cx # imm = 0xAAAA
+; AVX512BW-FAST-NEXT: kmovd %ecx, %k3
+; AVX512BW-FAST-NEXT: vmovdqa32 %zmm8, %zmm6 {%k3}
+; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7]
+; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FAST-NEXT: vpermq %zmm5, %zmm1, %zmm5
+; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm5, %zmm5
+; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
+; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm2, %zmm2
+; AVX512BW-FAST-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1}
+; AVX512BW-FAST-NEXT: vpermq %zmm4, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
+; AVX512BW-FAST-NEXT: vpshufb %zmm10, %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
+; AVX512BW-FAST-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax)
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, (%rax)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
%in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
%in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64
%in.vec2 = load <16 x i8>, ptr %in.vecptr2, align 64
@@ -6988,6 +7130,8 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1: {{.*}}
+; AVX512: {{.*}}
+; AVX512-FAST: {{.*}}
; AVX512-SLOW: {{.*}}
; AVX512BW: {{.*}}
; AVX512BW-ONLY: {{.*}}
@@ -6997,7 +7141,6 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-ONLY: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
-; AVX512F: {{.*}}
; AVX512F-ONLY: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}