diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2020-07-04 13:54:30 +0100 |
---|---|---|
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2020-07-04 13:54:30 +0100 |
commit | 71f342d6c3d82fa45c180f1981710bb6092d39fc (patch) | |
tree | a688cb0d930ee6c98d9216db876c184eb6accfed | |
parent | 07d4d84676a94741a226e254a1d536db99c7dac0 (diff) | |
download | llvm-71f342d6c3d82fa45c180f1981710bb6092d39fc.zip llvm-71f342d6c3d82fa45c180f1981710bb6092d39fc.tar.gz llvm-71f342d6c3d82fa45c180f1981710bb6092d39fc.tar.bz2 |
[X86][AVX] Fold PACK(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X))) -> SHUFFLE(PACK(LOSUBVECTOR(X),HISUBVECTOR(X)))
Using PACK for truncations leaves us with intermediate shuffles that can be tricky to remove while the truncation tree is being formed.
This fold helps pull out the PERMQ case which is one of the most common, avoiding some costly lane-crossing shuffles.
A future patch will begin adding more general shuffle folding, which we should be able to use for HADD/HSUB as well.
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 63 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/bitcast-setcc-512.ll | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/bitcast-vector-bool.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/masked_expandload.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/masked_store_trunc.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/masked_store_trunc_usat.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/movmsk-cmp.ll | 12 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/psubus.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-compare-results.ll | 12 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-reduce-and-bool.ll | 1 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-reduce-or-bool.ll | 3 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-trunc-packus.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-trunc-usat.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vector-trunc.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/vselect-packss.ll | 4 |
18 files changed, 105 insertions, 52 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1146b61..edc6ce1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5588,6 +5588,36 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask) { return canWidenShuffleElements(Mask, WidenedMask); } +// Attempt to narrow/widen shuffle mask until it matches the target number of +// elements. +static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts, + SmallVectorImpl<int> &ScaledMask) { + unsigned NumSrcElts = Mask.size(); + assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) && + "Illegal shuffle scale factor"); + + // Narrowing is guaranteed to work. + if (NumDstElts >= NumSrcElts) { + int Scale = NumDstElts / NumSrcElts; + llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask); + return true; + } + + // We have to repeat the widening until we reach the target size, but we can + // split out the first widening as it sets up ScaledMask for us. + if (canWidenShuffleElements(Mask, ScaledMask)) { + while (ScaledMask.size() > NumDstElts) { + SmallVector<int, 16> WidenedMask; + if (!canWidenShuffleElements(ScaledMask, WidenedMask)) + return false; + ScaledMask = std::move(WidenedMask); + } + return true; + } + + return false; +} + /// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { return isNullConstant(Elt) || isNullFPConstant(Elt); @@ -41763,6 +41793,7 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + unsigned NumDstElts = VT.getVectorNumElements(); unsigned DstBitsPerElt = VT.getScalarSizeInBits(); unsigned SrcBitsPerElt = 2 * DstBitsPerElt; assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt && @@ -41779,7 +41810,6 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) && getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) { unsigned NumLanes = VT.getSizeInBits() / 128; - unsigned NumDstElts = VT.getVectorNumElements(); unsigned NumSrcElts = NumDstElts / 2; unsigned NumDstEltsPerLane = NumDstElts / NumLanes; unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; @@ -41826,6 +41856,37 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N)); } + // Attempt to fold PACK(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X))) + // to SHUFFLE(PACK(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for + // truncation trees that help us avoid lane crossing shuffles. + // TODO: There's a lot more we can do for PACK/HADD style shuffle combines. + if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N0.getConstantOperandAPInt(1) == 0 && + N1.getConstantOperandAPInt(1) == (NumDstElts / 2) && + N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() && + N0.getOperand(0).getValueType().is256BitVector()) { + // TODO - support target/faux shuffles. + SDValue Vec = peekThroughBitcasts(N0.getOperand(0)); + if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) { + // To keep the PACK LHS/RHS coherency, we must be able to scale the unary + // shuffle to a vXi64 width - we can probably relax this in the future. + SmallVector<int, 4> ShuffleMask; + if (SVN->getOperand(1).isUndef() && + scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) { + SDLoc DL(N); + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL); + Lo = DAG.getBitcast(N0.getValueType(), Lo); + Hi = DAG.getBitcast(N1.getValueType(), Hi); + SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi); + Res = DAG.getBitcast(MVT::v4i32, Res); + Res = DAG.getVectorShuffle(MVT::v4i32, DL, Res, Res, ShuffleMask); + return DAG.getBitcast(VT, Res); + } + } + } + // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular // truncate to create a larger truncate. if (Subtarget.hasAVX512() && diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll index efe0431..2a66736 100644 --- a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll @@ -114,9 +114,9 @@ define i16 @v16i32(<16 x i32> %a, <16 x i32> %b) { ; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -175,9 +175,9 @@ define i16 @v16f32(<16 x float> %a, <16 x float> %b) { ; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -573,9 +573,9 @@ define void @bitcast_16i32_store(i16* %p, <16 x i32> %a0) { ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm0, %eax ; AVX2-NEXT: movw %ax, (%rdi) ; AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll index a862f73..faccb81 100644 --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -409,9 +409,9 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm0, %ecx ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: shrl $8, %eax diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll index a66f5a3..7d9e841 100644 --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -873,9 +873,9 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: vpcmpeqd %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpcmpeqd %ymm6, %ymm4, %ymm4 ; AVX2-NEXT: vpackssdw %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-NEXT: vpacksswb %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm4, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne LBB3_1 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index 51e0c25..341a349 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -2377,9 +2377,9 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, <16 x i16>* %p, <16 x i32> ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB9_1 @@ -3058,9 +3058,9 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, <16 x i8>* %p, <16 x i32> %m ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB10_1 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index 51a9d92..c120684 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -782,9 +782,9 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, <8 x i16>* %p, <8 x i32> %mask ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 ; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax ; AVX2-NEXT: notl %eax @@ -3290,9 +3290,9 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, <16 x i16>* %p, <16 x i32> ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB9_1 @@ -3952,18 +3952,18 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, <16 x i8>* %p, <16 x i32> %m ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm3, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB10_1 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index f90d001..0160733 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -655,9 +655,9 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, <8 x i16>* %p, <8 x i32> %mask ; AVX2-NEXT: vpcmpgtq %ymm5, %ymm7, %ymm5 ; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax ; AVX2-NEXT: notl %eax @@ -2958,9 +2958,9 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, <16 x i16>* %p, <16 x i32> ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB9_1 @@ -3658,18 +3658,18 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, <16 x i8>* %p, <16 x i32> %m ; AVX2-NEXT: vpminud %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpminud %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm3, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm1, %eax ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB10_1 diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll index 288069b..0870650 100644 --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -706,7 +706,6 @@ define i1 @allones_v16i32_sign(<16 x i32> %arg) { ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovmskb %xmm0, %eax @@ -767,9 +766,8 @@ define i1 @allzeros_v16i32_sign(<16 x i32> %arg) { ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2149,7 +2147,6 @@ define i1 @allones_v16i32_and1(<16 x i32> %arg) { ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovmskb %xmm0, %eax @@ -2211,9 +2208,8 @@ define i1 @allzeros_v16i32_and1(<16 x i32> %arg) { ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3522,7 +3518,6 @@ define i1 @allones_v16i32_and4(<16 x i32> %arg) { ; AVX2-NEXT: vpslld $29, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovmskb %xmm0, %eax @@ -3584,9 +3579,8 @@ define i1 @allzeros_v16i32_and4(<16 x i32> %arg) { ; AVX2-NEXT: vpslld $29, %ymm0, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 6e01c60..28ab2e1 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -1732,9 +1732,9 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 ; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll index d0eccaa..80cfa36 100644 --- a/llvm/test/CodeGen/X86/vector-compare-results.ll +++ b/llvm/test/CodeGen/X86/vector-compare-results.ll @@ -410,9 +410,9 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind { ; AVX2-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -475,9 +475,9 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind { ; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -589,9 +589,9 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -657,9 +657,9 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind { ; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -946,9 +946,9 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1243,9 +1243,9 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind { ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll index b79689b..9812576 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -1506,7 +1506,6 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>) { ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovmskb %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll index be93622..3491839 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -1497,9 +1497,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>) { ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll index ed47482..627faa0 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -1710,9 +1710,9 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>) { ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm0, %eax ; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %ecx diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index 9d6e582..fe16450 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -2234,9 +2234,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64>* %p0) "min-legal-vector-wid ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5027,9 +5027,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64>* %p0) "min-legal-vector-w ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5457,9 +5457,9 @@ define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32>* %p0) "min-legal-vector-w ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5513,9 +5513,9 @@ define void @trunc_packus_v16i32_v16i8_store(<16 x i32>* %p0, <16 x i8>* %p1) "m ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index 0d9f986..442d903 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -2216,9 +2216,9 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(<8 x i64>* %p0) "min-legal-vector-width ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4829,9 +4829,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64>* %p0) "min-legal-vector-wid ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5232,9 +5232,9 @@ define <16 x i8> @trunc_ssat_v16i32_v16i8(<16 x i32>* %p0) "min-legal-vector-wid ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5288,9 +5288,9 @@ define void @trunc_ssat_v16i32_v16i8_store(<16 x i32>* %p0, <16 x i8>* %p1) "min ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index a6e3e9f..2e8f3bb 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -1541,9 +1541,9 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64>* %p0) { ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3537,9 +3537,9 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64>* %p0) { ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4086,9 +4086,9 @@ define <16 x i8> @trunc_usat_v16i32_v16i8(<16 x i32>* %p0) { ; AVX2-NEXT: vpminud 32(%rdi), %ymm0, %ymm1 ; AVX2-NEXT: vpminud (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4230,9 +4230,9 @@ define void @trunc_usat_v16i32_v16i8_store(<16 x i32>* %p0, <16 x i8>* %p1) { ; AVX2-NEXT: vpminud 32(%rdi), %ymm0, %ymm1 ; AVX2-NEXT: vpminud (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index b0a93a8..a5f6be5 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -927,9 +927,9 @@ define void @trunc16i32_16i8_ashr(<16 x i32> %a) { ; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1 ; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1004,9 +1004,9 @@ define void @trunc16i32_16i8_lshr(<16 x i32> %a) { ; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vselect-packss.ll b/llvm/test/CodeGen/X86/vselect-packss.ll index 34b336c..966642d54 100644 --- a/llvm/test/CodeGen/X86/vselect-packss.ll +++ b/llvm/test/CodeGen/X86/vselect-packss.ll @@ -137,9 +137,9 @@ define <16 x i8> @vselect_packss_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i8 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpblendvb %xmm0, %xmm4, %xmm5, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -283,9 +283,9 @@ define <16 x i8> @vselect_packss_v16i64(<16 x i64> %a0, <16 x i64> %a1, <16 x i8 ; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1 ; AVX2-NEXT: vpandn {{[0-9]+}}(%rsp), %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 |