aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon Pilgrim <llvm-dev@redking.me.uk>2020-07-04 13:54:30 +0100
committerSimon Pilgrim <llvm-dev@redking.me.uk>2020-07-04 13:54:30 +0100
commit71f342d6c3d82fa45c180f1981710bb6092d39fc (patch)
treea688cb0d930ee6c98d9216db876c184eb6accfed
parent07d4d84676a94741a226e254a1d536db99c7dac0 (diff)
downloadllvm-71f342d6c3d82fa45c180f1981710bb6092d39fc.zip
llvm-71f342d6c3d82fa45c180f1981710bb6092d39fc.tar.gz
llvm-71f342d6c3d82fa45c180f1981710bb6092d39fc.tar.bz2
[X86][AVX] Fold PACK(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X))) -> SHUFFLE(PACK(LOSUBVECTOR(X),HISUBVECTOR(X)))
Using PACK for truncations leaves us with intermediate shuffles that can be tricky to remove while the truncation tree is being formed. This fold helps pull out the PERMQ case which is one of the most common, avoiding some costly lane-crossing shuffles. A future patch will begin adding more general shuffle folding, which we should be able to use for HADD/HSUB as well.
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp63
-rw-r--r--llvm/test/CodeGen/X86/bitcast-setcc-512.ll6
-rw-r--r--llvm/test/CodeGen/X86/bitcast-vector-bool.ll2
-rw-r--r--llvm/test/CodeGen/X86/masked_expandload.ll2
-rw-r--r--llvm/test/CodeGen/X86/masked_store_trunc.ll4
-rw-r--r--llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll8
-rw-r--r--llvm/test/CodeGen/X86/masked_store_trunc_usat.ll8
-rw-r--r--llvm/test/CodeGen/X86/movmsk-cmp.ll12
-rw-r--r--llvm/test/CodeGen/X86/psubus.ll2
-rw-r--r--llvm/test/CodeGen/X86/vector-compare-results.ll12
-rw-r--r--llvm/test/CodeGen/X86/vector-reduce-and-bool.ll1
-rw-r--r--llvm/test/CodeGen/X86/vector-reduce-or-bool.ll3
-rw-r--r--llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll2
-rw-r--r--llvm/test/CodeGen/X86/vector-trunc-packus.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-trunc-ssat.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-trunc-usat.ll8
-rw-r--r--llvm/test/CodeGen/X86/vector-trunc.ll4
-rw-r--r--llvm/test/CodeGen/X86/vselect-packss.ll4
18 files changed, 105 insertions, 52 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1146b61..edc6ce1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5588,6 +5588,36 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask) {
return canWidenShuffleElements(Mask, WidenedMask);
}
+// Attempt to narrow/widen shuffle mask until it matches the target number of
+// elements.
+static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
+ SmallVectorImpl<int> &ScaledMask) {
+ unsigned NumSrcElts = Mask.size();
+ assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
+ "Illegal shuffle scale factor");
+
+ // Narrowing is guaranteed to work.
+ if (NumDstElts >= NumSrcElts) {
+ int Scale = NumDstElts / NumSrcElts;
+ llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
+ return true;
+ }
+
+ // We have to repeat the widening until we reach the target size, but we can
+ // split out the first widening as it sets up ScaledMask for us.
+ if (canWidenShuffleElements(Mask, ScaledMask)) {
+ while (ScaledMask.size() > NumDstElts) {
+ SmallVector<int, 16> WidenedMask;
+ if (!canWidenShuffleElements(ScaledMask, WidenedMask))
+ return false;
+ ScaledMask = std::move(WidenedMask);
+ }
+ return true;
+ }
+
+ return false;
+}
+
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
@@ -41763,6 +41793,7 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ unsigned NumDstElts = VT.getVectorNumElements();
unsigned DstBitsPerElt = VT.getScalarSizeInBits();
unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
@@ -41779,7 +41810,6 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
unsigned NumLanes = VT.getSizeInBits() / 128;
- unsigned NumDstElts = VT.getVectorNumElements();
unsigned NumSrcElts = NumDstElts / 2;
unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
@@ -41826,6 +41856,37 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
}
+ // Attempt to fold PACK(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
+ // to SHUFFLE(PACK(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
+ // truncation trees that help us avoid lane crossing shuffles.
+ // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
+ if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getConstantOperandAPInt(1) == 0 &&
+ N1.getConstantOperandAPInt(1) == (NumDstElts / 2) &&
+ N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
+ N0.getOperand(0).getValueType().is256BitVector()) {
+ // TODO - support target/faux shuffles.
+ SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
+ if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
+ // To keep the PACK LHS/RHS coherency, we must be able to scale the unary
+ // shuffle to a vXi64 width - we can probably relax this in the future.
+ SmallVector<int, 4> ShuffleMask;
+ if (SVN->getOperand(1).isUndef() &&
+ scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
+ SDLoc DL(N);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
+ Lo = DAG.getBitcast(N0.getValueType(), Lo);
+ Hi = DAG.getBitcast(N1.getValueType(), Hi);
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
+ Res = DAG.getBitcast(MVT::v4i32, Res);
+ Res = DAG.getVectorShuffle(MVT::v4i32, DL, Res, Res, ShuffleMask);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+ }
+
// Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
// truncate to create a larger truncate.
if (Subtarget.hasAVX512() &&
diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll
index efe0431..2a66736 100644
--- a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll
+++ b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll
@@ -114,9 +114,9 @@ define i16 @v16i32(<16 x i32> %a, <16 x i32> %b) {
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
@@ -175,9 +175,9 @@ define i16 @v16f32(<16 x float> %a, <16 x float> %b) {
; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
@@ -573,9 +573,9 @@ define void @bitcast_16i32_store(i16* %p, <16 x i32> %a0) {
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: movw %ax, (%rdi)
; AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index a862f73..faccb81 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -409,9 +409,9 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm0, %ecx
; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: shrl $8, %eax
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index a66f5a3..7d9e841 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -873,9 +873,9 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; AVX2-NEXT: vpcmpeqd %ymm6, %ymm5, %ymm5
; AVX2-NEXT: vpcmpeqd %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vpackssdw %ymm5, %ymm4, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm4, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: jne LBB3_1
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index 51e0c25..341a349 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -2377,9 +2377,9 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, <16 x i16>* %p, <16 x i32>
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: jne .LBB9_1
@@ -3058,9 +3058,9 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, <16 x i8>* %p, <16 x i32> %m
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: jne .LBB10_1
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index 51a9d92..c120684 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -782,9 +782,9 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, <8 x i16>* %p, <8 x i32> %mask
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vmovmskps %ymm1, %eax
; AVX2-NEXT: notl %eax
@@ -3290,9 +3290,9 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, <16 x i16>* %p, <16 x i32>
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: jne .LBB9_1
@@ -3952,18 +3952,18 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, <16 x i8>* %p, <16 x i32> %m
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm3, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: jne .LBB10_1
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index f90d001..0160733 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -655,9 +655,9 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, <8 x i16>* %p, <8 x i32> %mask
; AVX2-NEXT: vpcmpgtq %ymm5, %ymm7, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vmovmskps %ymm1, %eax
; AVX2-NEXT: notl %eax
@@ -2958,9 +2958,9 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, <16 x i16>* %p, <16 x i32>
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: jne .LBB9_1
@@ -3658,18 +3658,18 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, <16 x i8>* %p, <16 x i32> %m
; AVX2-NEXT: vpminud %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpminud %ymm5, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm3, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm1, %eax
; AVX2-NEXT: testb $1, %al
; AVX2-NEXT: jne .LBB10_1
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 288069b..0870650 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -706,7 +706,6 @@ define i1 @allones_v16i32_sign(<16 x i32> %arg) {
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
@@ -767,9 +766,8 @@ define i1 @allzeros_v16i32_sign(<16 x i32> %arg) {
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpmovmskb %ymm0, %eax
-; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX2-NEXT: testl %eax, %eax
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -2149,7 +2147,6 @@ define i1 @allones_v16i32_and1(<16 x i32> %arg) {
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
@@ -2211,9 +2208,8 @@ define i1 @allzeros_v16i32_and1(<16 x i32> %arg) {
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpmovmskb %ymm0, %eax
-; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX2-NEXT: testl %eax, %eax
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -3522,7 +3518,6 @@ define i1 @allones_v16i32_and4(<16 x i32> %arg) {
; AVX2-NEXT: vpslld $29, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
@@ -3584,9 +3579,8 @@ define i1 @allzeros_v16i32_and4(<16 x i32> %arg) {
; AVX2-NEXT: vpslld $29, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpmovmskb %ymm0, %eax
-; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX2-NEXT: testl %eax, %eax
; AVX2-NEXT: sete %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 6e01c60..28ab2e1 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -1732,9 +1732,9 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1
; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll
index d0eccaa..80cfa36 100644
--- a/llvm/test/CodeGen/X86/vector-compare-results.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-results.ll
@@ -410,9 +410,9 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; AVX2-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -475,9 +475,9 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -589,9 +589,9 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -657,9 +657,9 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -946,9 +946,9 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1243,9 +1243,9 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index b79689b..9812576 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -1506,7 +1506,6 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>) {
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index be93622..3491839 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -1497,9 +1497,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>) {
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpmovmskb %ymm0, %eax
-; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX2-NEXT: testl %eax, %eax
; AVX2-NEXT: setne %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index ed47482..627faa0 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -1710,9 +1710,9 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>) {
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: shrl $8, %ecx
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index 9d6e582..fe16450 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -2234,9 +2234,9 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64>* %p0) "min-legal-vector-wid
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -5027,9 +5027,9 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64>* %p0) "min-legal-vector-w
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -5457,9 +5457,9 @@ define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32>* %p0) "min-legal-vector-w
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -5513,9 +5513,9 @@ define void @trunc_packus_v16i32_v16i8_store(<16 x i32>* %p0, <16 x i8>* %p1) "m
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index 0d9f986..442d903 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -2216,9 +2216,9 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(<8 x i64>* %p0) "min-legal-vector-width
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -4829,9 +4829,9 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64>* %p0) "min-legal-vector-wid
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -5232,9 +5232,9 @@ define <16 x i8> @trunc_ssat_v16i32_v16i8(<16 x i32>* %p0) "min-legal-vector-wid
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -5288,9 +5288,9 @@ define void @trunc_ssat_v16i32_v16i8_store(<16 x i32>* %p0, <16 x i8>* %p1) "min
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index a6e3e9f..2e8f3bb 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -1541,9 +1541,9 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64>* %p0) {
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -3537,9 +3537,9 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64>* %p0) {
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -4086,9 +4086,9 @@ define <16 x i8> @trunc_usat_v16i32_v16i8(<16 x i32>* %p0) {
; AVX2-NEXT: vpminud 32(%rdi), %ymm0, %ymm1
; AVX2-NEXT: vpminud (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -4230,9 +4230,9 @@ define void @trunc_usat_v16i32_v16i8_store(<16 x i32>* %p0, <16 x i8>* %p1) {
; AVX2-NEXT: vpminud 32(%rdi), %ymm0, %ymm1
; AVX2-NEXT: vpminud (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index b0a93a8..a5f6be5 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -927,9 +927,9 @@ define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1
; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1004,9 +1004,9 @@ define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vselect-packss.ll b/llvm/test/CodeGen/X86/vselect-packss.ll
index 34b336c..966642d54 100644
--- a/llvm/test/CodeGen/X86/vselect-packss.ll
+++ b/llvm/test/CodeGen/X86/vselect-packss.ll
@@ -137,9 +137,9 @@ define <16 x i8> @vselect_packss_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i8
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpblendvb %xmm0, %xmm4, %xmm5, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -283,9 +283,9 @@ define <16 x i8> @vselect_packss_v16i64(<16 x i64> %a0, <16 x i64> %a1, <16 x i8
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
; AVX2-NEXT: vpandn {{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0