-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp        | 158
-rw-r--r--  llvm/test/CodeGen/X86/avx-basic.ll              |   6
-rw-r--r--  llvm/test/CodeGen/X86/avx-sext.ll               |   2
-rw-r--r--  llvm/test/CodeGen/X86/avx-splat.ll              |   2
-rw-r--r--  llvm/test/CodeGen/X86/exedepsfix-broadcast.ll   |   4
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll  |   2
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll  |  29
7 files changed, 101 insertions, 102 deletions
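The change below makes the x86 shuffle lowering and shuffle combining prefer the PSHUFD form over UNPCK for single-input integer dup masks, since PSHUFD can fold a load and implicitly copy. As a minimal sketch of the effect (the function below is hypothetical, not one of this commit's test cases, and assumes the experimental vector shuffle lowering that these tests exercise), a dup of the low 64-bit element now comes out as a vpshufd rather than a vpunpcklqdq, mirroring the test updates in vector-shuffle-256-v4.ll:

; Hypothetical example, not from this commit's tests.
; With this change (AVX1): vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
; Previously:              vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
define <2 x i64> @splat_lo_v2i64(<2 x i64> %v) {
entry:
  %s = shufflevector <2 x i64> %v, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x i64> %s
}

The PSHUFD form is one byte longer to encode but keeps the option of folding a load into the shuffle; nothing prevents switching to the shorter UNPCK encoding after register allocation.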
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cbaca28..a4e9455 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7598,11 +7598,22 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   int NumV2Elements =
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });

-  if (NumV2Elements == 0)
+  if (NumV2Elements == 0) {
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
+    // We coerce the shuffle pattern to be compatible with UNPCK instructions
+    // but we aren't actually going to use the UNPCK instruction because doing
+    // so prevents folding a load into this instruction or making a copy.
+    const int UnpackLoMask[] = {0, 0, 1, 1};
+    const int UnpackHiMask[] = {2, 2, 3, 3};
+    if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
+      Mask = UnpackLoMask;
+    else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
+      Mask = UnpackHiMask;
+
     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
+  }

   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
@@ -19347,86 +19358,75 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
   bool FloatDomain = VT.isFloatingPoint();

   // For floating point shuffles, we don't have free copies in the shuffle
-  // instructions, so this always makes sense to canonicalize.
+  // instructions or the ability to load as part of the instruction, so
+  // canonicalize their shuffles to UNPCK or MOV variants.
   //
-  // For integer shuffles, if we don't have access to VEX encodings, the generic
-  // PSHUF instructions are preferable to some of the specialized forms despite
-  // requiring one more byte to encode because they can implicitly copy.
-  //
-  // IF we *do* have VEX encodings, then we can use shorter, more specific
-  // shuffle instructions freely as they can copy due to the extra register
-  // operand.
-  if (FloatDomain || Subtarget->hasAVX()) {
-    // We have both floating point and integer variants of shuffles that dup
-    // either the low or high half of the vector.
-    if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
-      bool Lo = Mask.equals(0, 0);
-      unsigned Shuffle;
-      MVT ShuffleVT;
-      // If the input is a floating point, check if we have SSE3 which will let
-      // us use MOVDDUP. That instruction is no slower than UNPCKLPD but has the
-      // option to fold the input operand into even an unaligned memory load.
-      if (FloatDomain && Lo && Subtarget->hasSSE3()) {
-        Shuffle = X86ISD::MOVDDUP;
-        ShuffleVT = MVT::v2f64;
-      } else if (FloatDomain) {
-        // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
-        // than the UNPCK variants.
-        Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
-        ShuffleVT = MVT::v4f32;
-      } else if (Subtarget->hasSSE2()) {
-        // We model everything else using UNPCK instructions. While MOVLHPS and
-        // MOVHLPS are shorter encodings they cannot accept a memory operand
-        // which overly constrains subsequent lowering.
-        Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
-        ShuffleVT = MVT::v2i64;
-      } else {
-        // No available instructions here.
-        return false;
-      }
-      if (Depth == 1 && Root->getOpcode() == Shuffle)
-        return false; // Nothing to do!
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
-      DCI.AddToWorklist(Op.getNode());
-      if (Shuffle == X86ISD::MOVDDUP)
-        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
-      else
-        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
-      DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
-                    /*AddTo*/ true);
-      return true;
-    }
-
-    // FIXME: We should match UNPCKLPS and UNPCKHPS here.
-
-    // For the integer domain we have specialized instructions for duplicating
-    // any element size from the low or high half.
-    if (!FloatDomain &&
-        (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3) ||
-         Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
-         Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
-         Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
-         Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
-                     15))) {
-      bool Lo = Mask[0] == 0;
-      unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
-      if (Depth == 1 && Root->getOpcode() == Shuffle)
-        return false; // Nothing to do!
-      MVT ShuffleVT;
-      switch (Mask.size()) {
-      case 4: ShuffleVT = MVT::v4i32; break;
-      case 8: ShuffleVT = MVT::v8i16; break;
-      case 16: ShuffleVT = MVT::v16i8; break;
-      };
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
-      DCI.AddToWorklist(Op.getNode());
+  // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
+  // vectors because it can have a load folded into it that UNPCK cannot. This
+  // doesn't preclude something switching to the shorter encoding post-RA.
+  if (FloatDomain && (Mask.equals(0, 0) || Mask.equals(1, 1))) {
+    bool Lo = Mask.equals(0, 0);
+    unsigned Shuffle;
+    MVT ShuffleVT;
+    // Check if we have SSE3 which will let us use MOVDDUP. That instruction
+    // is no slower than UNPCKLPD but has the option to fold the input operand
+    // into even an unaligned memory load.
+    if (Lo && Subtarget->hasSSE3()) {
+      Shuffle = X86ISD::MOVDDUP;
+      ShuffleVT = MVT::v2f64;
+    } else {
+      // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
+      // than the UNPCK variants.
+      Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
+      ShuffleVT = MVT::v4f32;
+    }
+    if (Depth == 1 && Root->getOpcode() == Shuffle)
+      return false; // Nothing to do!
+    Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+    DCI.AddToWorklist(Op.getNode());
+    if (Shuffle == X86ISD::MOVDDUP)
+      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+    else
       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
-      DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
-                    /*AddTo*/ true);
-      return true;
-    }
+    DCI.AddToWorklist(Op.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  // FIXME: We should match UNPCKLPS and UNPCKHPS here.
+
+  // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
+  // variants as none of these have single-instruction variants that are
+  // superior to the UNPCK formulation.
+  if (!FloatDomain &&
+      (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
+       Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
+       Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
+       Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
+                   15))) {
+    bool Lo = Mask[0] == 0;
+    unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+    if (Depth == 1 && Root->getOpcode() == Shuffle)
+      return false; // Nothing to do!
+    MVT ShuffleVT;
+    switch (Mask.size()) {
+    case 8:
+      ShuffleVT = MVT::v8i16;
+      break;
+    case 16:
+      ShuffleVT = MVT::v16i8;
+      break;
+    default:
+      llvm_unreachable("Impossible mask size!");
+    };
+    Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+    DCI.AddToWorklist(Op.getNode());
+    Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+    DCI.AddToWorklist(Op.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                  /*AddTo*/ true);
+    return true;
   }

   // Don't try to re-form single instruction chains under any circumstances now
diff --git a/llvm/test/CodeGen/X86/avx-basic.ll b/llvm/test/CodeGen/X86/avx-basic.ll
index ca54022..a8dae82 100644
--- a/llvm/test/CodeGen/X86/avx-basic.ll
+++ b/llvm/test/CodeGen/X86/avx-basic.ll
@@ -72,9 +72,9 @@ entry:
   ret <4 x i64> %shuffle
 }

-; CHECK: vpunpcklqdq
+; CHECK: vmovlhps
 ; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpunpcklqdq
+; CHECK-NEXT: vmovlhps
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
 entry:
@@ -83,7 +83,7 @@ entry:
   ret <4 x i64> %shuffle
 }
 ; CHECK: vpshufd $-96
-; CHECK: vpunpckhdq
+; CHECK: vpshufd $-6
 ; CHECK: vinsertf128 $1
 define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
 entry:
diff --git a/llvm/test/CodeGen/X86/avx-sext.ll b/llvm/test/CodeGen/X86/avx-sext.ll
index 9bcf06f..fb2287f 100644
--- a/llvm/test/CodeGen/X86/avx-sext.ll
+++ b/llvm/test/CodeGen/X86/avx-sext.ll
@@ -156,7 +156,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {

 ; AVX-LABEL: sext_16i8_to_16i16
 ; AVX: vpmovsxbw
-; AVX: vpunpckhqdq
+; AVX: vmovhlps
 ; AVX: vpmovsxbw
 ; AVX: ret
 define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll
index a2537ce..058db31 100644
--- a/llvm/test/CodeGen/X86/avx-splat.ll
+++ b/llvm/test/CodeGen/X86/avx-splat.ll
@@ -19,7 +19,7 @@ entry:
 }

 ; CHECK: vmovq
-; CHECK-NEXT: vpunpcklqdq %xmm
+; CHECK-NEXT: vmovlhps %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
 entry:
diff --git a/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll b/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll
index f4539c8..ab79495 100644
--- a/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll
+++ b/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll
@@ -95,8 +95,8 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %
 ; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
 ; ExeDepsFix works top down, thus it coalesces vpunpcklqdq domain with
 ; vpand and there is nothing more you can do to match vmaxpd.
-; CHECK: vpunpcklqdq
-; CHECK: vpand
+; CHECK: vmovlhps
+; CHECK: vandps
 ; CHECK: vmaxpd
 ; CHECK: ret
 define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 8faa3f0..f959fea 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -639,7 +639,7 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
 ; ALL-LABEL: @shuffle_v8i16_XXXdXXXX
 ; ALL: # BB#0:
-; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm1[0,2,2,3]
+; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,2,3,3]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index cd79a38..affa9337 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-unknown"
 define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0001
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
@@ -18,7 +18,7 @@ define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
@@ -41,7 +41,7 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
@@ -52,7 +52,7 @@ define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_1000
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
@@ -63,8 +63,8 @@ define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_2200
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
@@ -76,7 +76,7 @@ define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[1],xmm0[0]
-; AVX1-NEXT: vpunpckhqdq {{.*}} # xmm1 = xmm1[1,1]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
@@ -281,7 +281,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0124
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -292,7 +292,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0142
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm2 = xmm2[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm2[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -304,7 +304,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -316,7 +316,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -336,7 +336,7 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm1[2,3,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -356,7 +356,7 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -368,8 +368,7 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @stress_test1
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT: vpunpckhqdq {{.*}} # xmm0 = xmm0[1,1]
-; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0