author     Simon Pilgrim <llvm-dev@redking.me.uk>    2022-04-09 16:05:46 +0100
committer  Simon Pilgrim <llvm-dev@redking.me.uk>    2022-04-09 16:05:50 +0100
commit     30a01bccda5c6e088927eb9d72aeb2dbccf4c45d (patch)
tree       ce6e5cb9c8a1a9abef2c8ad284f6b12c5f27fd36
parent     f67e3f6e8c566928c88262df5664ac6e679753d2 (diff)
[X86] Fold concat(pshufb(x,y),pshufb(z,w)) -> pshufb(concat(x,z),concat(y,w))
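For illustration, here is a minimal IR sketch of the pattern this combine targets. The example is mine, not taken from the commit's tests: the function name is made up and AVX2 (Int256) is assumed; the same fold applies to 512-bit concatenations when BWI registers are available. Two 128-bit PSHUFBs whose results are concatenated can now be selected as a single wider PSHUFB of concat(x,z) and concat(y,w) instead of two narrow shuffles plus an insert.

    ; Hypothetical example (not from this commit). With -mattr=+avx2, the
    ; concat of the two 128-bit pshufb results below can now be lowered as a
    ; single 256-bit vpshufb of the concatenated sources and masks.
    define <32 x i8> @concat_pshufb(<16 x i8> %x, <16 x i8> %y, <16 x i8> %z, <16 x i8> %w) {
      %lo = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %x, <16 x i8> %y)
      %hi = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %z, <16 x i8> %w)
      ; identity concat mask: lanes 0..15 from %lo, lanes 16..31 from %hi
      %r = shufflevector <16 x i8> %lo, <16 x i8> %hi,
                         <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
                                     i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
                                     i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
      ret <32 x i8> %r
    }
    declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)

The patch also folds the concatenated constant mask operands back into a single constant-pool entry, which is why the updated tests load the shared mask via vbroadcasti128 / vbroadcasti64x4 instead of materializing separate xmm/ymm constants.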
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp                        |  25
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll  |  40
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll                |  48
-rw-r--r--  llvm/test/CodeGen/X86/x86-interleaved-access.ll                |  92
4 files changed, 127 insertions, 78 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e27b5ab..f18162b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53329,6 +53329,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
                            DAG.getTargetConstant(Idx, DL, MVT::i8));
       }
       break;
+    case X86ISD::PSHUFB:
+      if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
+                       (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
+        return DAG.getNode(Op0.getOpcode(), DL, VT,
+                           ConcatSubOperand(VT, Ops, 0),
+                           ConcatSubOperand(VT, Ops, 1));
+      }
+      break;
     case X86ISD::VPERMV3:
       if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
         MVT OpVT = Op0.getSimpleValueType();
@@ -53464,6 +53472,23 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     }
   }
 
+  // Attempt to fold target constant loads.
+  if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
+    SmallVector<APInt> EltBits;
+    APInt UndefElts = APInt::getNullValue(VT.getVectorNumElements());
+    for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
+      APInt OpUndefElts;
+      SmallVector<APInt> OpEltBits;
+      if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
+                                         OpEltBits, true, false))
+        break;
+      EltBits.append(OpEltBits);
+      UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
+    }
+    if (EltBits.size() == VT.getVectorNumElements())
+      return getConstVector(EltBits, UndefElts, VT, DAG, DL);
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index 89991e2..06d9cc0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -314,21 +314,23 @@ define void @store_i8_stride3_vf16(<16 x i8>* %in.vecptr0, <16 x i8>* %in.vecptr
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512-NEXT:    vmovdqa (%rsi), %xmm1
 ; AVX512-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX512-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-NEXT:    vmovdqa %xmm0, 16(%rcx)
-; AVX512-NEXT:    vmovdqa %xmm1, (%rcx)
-; AVX512-NEXT:    vmovdqa %xmm2, 32(%rcx)
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+; AVX512-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
+; AVX512-NEXT:    vmovdqa %xmm0, 32(%rcx)
+; AVX512-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %in.vec0 = load <16 x i8>, <16 x i8>* %in.vecptr0, align 32
   %in.vec1 = load <16 x i8>, <16 x i8>* %in.vecptr1, align 32
@@ -544,13 +546,13 @@ define void @store_i8_stride3_vf32(<32 x i8>* %in.vecptr0, <32 x i8>* %in.vecptr
 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm3
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
 ; AVX512-NEXT:    vmovdqa %ymm0, 64(%rcx)
 ; AVX512-NEXT:    vmovdqu64 %zmm1, (%rcx)
 ; AVX512-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 005aa6b..2b71022 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -1493,13 +1493,13 @@ define <64 x i8> @PR54562_ref(<64 x i8> %a0) {
 ; AVX512F-LABEL: PR54562_ref:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
-; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
+; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -1513,13 +1513,13 @@ define <64 x i8> @PR54562_ref(<64 x i8> %a0) {
 ; AVX512DQ-LABEL: PR54562_ref:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
-; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    retq
 ;
@@ -1538,13 +1538,13 @@ define void @PR54562_mem(<64 x i8>* %src, <64 x i8>* %dst) {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm0
 ; AVX512F-NEXT:    vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
+; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
-; AVX512F-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
-; AVX512F-NEXT:    vmovdqa %xmm0, 48(%rsi)
-; AVX512F-NEXT:    vmovdqa %xmm1, 32(%rsi)
-; AVX512F-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512F-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vmovdqa %ymm1, (%rsi)
+; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rsi)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1561,13 +1561,13 @@ define void @PR54562_mem(<64 x i8>* %src, <64 x i8>* %dst) {
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
-; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
-; AVX512DQ-NEXT:    vmovdqa %xmm0, 48(%rsi)
-; AVX512DQ-NEXT:    vmovdqa %xmm1, 32(%rsi)
-; AVX512DQ-NEXT:    vmovdqa %ymm2, (%rsi)
+; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rsi)
+; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rsi)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index ad0bb4d..1ee33ae 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -972,24 +972,47 @@
   ret void
 }
 define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) {
-; AVX-LABEL: interleaved_store_vf16_i8_stride3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
-; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
-; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
-; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
-; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX-NEXT:    vmovdqu %xmm0, 16(%rdi)
-; AVX-NEXT:    vmovdqu %xmm1, (%rdi)
-; AVX-NEXT:    vmovdqu %xmm2, 32(%rdi)
-; AVX-NEXT:    retq
+; AVX1OR2-LABEL: interleaved_store_vf16_i8_stride3:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX1OR2-NEXT:    vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1OR2-NEXT:    vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1OR2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1OR2-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX1OR2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX1OR2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX1OR2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1OR2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX1OR2-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX1OR2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1OR2-NEXT:    vmovdqu %xmm0, 16(%rdi)
+; AVX1OR2-NEXT:    vmovdqu %xmm1, (%rdi)
+; AVX1OR2-NEXT:    vmovdqu %xmm2, 32(%rdi)
+; AVX1OR2-NEXT:    retq
+;
+; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+; AVX512-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
+; AVX512-NEXT:    vmovdqu %xmm0, 32(%rdi)
+; AVX512-NEXT:    vmovdqu %ymm1, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
@@ -1069,13 +1092,13 @@ define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x
 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
 ; AVX512-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm3
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-NEXT:    vpshufb %zmm2, %zmm1, %zmm1
 ; AVX512-NEXT:    vmovdqu %ymm0, 64(%rdi)
 ; AVX512-NEXT:    vmovdqu64 %zmm1, (%rdi)
 ; AVX512-NEXT:    vzeroupper
@@ -1209,26 +1232,25 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
 ; AVX512-NEXT:    vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm4[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm4[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm4[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm4[48,49,50,51,52]
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm3
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm5, %ymm5
-; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm2[2,3]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm2[2,3]
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm7
-; AVX512-NEXT:    vpshufb %ymm4, %ymm7, %ymm7
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm6
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
 ; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm2
-; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm3
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm2
+; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm3, %zmm4
+; AVX512-NEXT:    vpshufb %zmm4, %zmm2, %zmm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512-NEXT:    vpshufb %zmm4, %zmm5, %zmm4
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovdqu64 %zmm0, 128(%rdi)
-; AVX512-NEXT:    vmovdqu64 %zmm3, 64(%rdi)
+; AVX512-NEXT:    vmovdqu64 %zmm4, 64(%rdi)
 ; AVX512-NEXT:    vmovdqu64 %zmm2, (%rdi)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq