-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp        | 158
-rw-r--r--  llvm/test/CodeGen/X86/avx-basic.ll              |   6
-rw-r--r--  llvm/test/CodeGen/X86/avx-sext.ll               |   2
-rw-r--r--  llvm/test/CodeGen/X86/avx-splat.ll              |   2
-rw-r--r--  llvm/test/CodeGen/X86/exedepsfix-broadcast.ll   |   4
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll  |   2
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll  |  29
7 files changed, 101 insertions, 102 deletions
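The change below makes the x86 shuffle lowering and shuffle combining prefer the PSHUFD form over UNPCK for single-input integer dup masks, since PSHUFD can fold a load and implicitly copy. As a minimal sketch of the effect (the function below is hypothetical, not one of this commit's test cases, and assumes the experimental vector shuffle lowering that these tests exercise), a dup of the low 64-bit element now comes out as a vpshufd rather than a vpunpcklqdq, mirroring the test updates in vector-shuffle-256-v4.ll:

; Hypothetical example, not from this commit's tests.
; With this change (AVX1): vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
; Previously:              vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
define <2 x i64> @splat_lo_v2i64(<2 x i64> %v) {
entry:
  %s = shufflevector <2 x i64> %v, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x i64> %s
}

The PSHUFD form is one byte longer to encode but keeps the option of folding a load into the shuffle; nothing prevents switching to the shorter UNPCK encoding after register allocation.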
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cbaca28..a4e9455 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7598,11 +7598,22 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   int NumV2Elements =
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });

-  if (NumV2Elements == 0)
+  if (NumV2Elements == 0) {
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
+    // We coerce the shuffle pattern to be compatible with UNPCK instructions
+    // but we aren't actually going to use the UNPCK instruction because doing
+    // so prevents folding a load into this instruction or making a copy.
+    const int UnpackLoMask[] = {0, 0, 1, 1};
+    const int UnpackHiMask[] = {2, 2, 3, 3};
+    if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
+      Mask = UnpackLoMask;
+    else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
+      Mask = UnpackHiMask;
+
     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
+  }

   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
@@ -19347,86 +19358,75 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
   bool FloatDomain = VT.isFloatingPoint();

   // For floating point shuffles, we don't have free copies in the shuffle
-  // instructions, so this always makes sense to canonicalize.
+  // instructions or the ability to load as part of the instruction, so
+  // canonicalize their shuffles to UNPCK or MOV variants.
   //
-  // For integer shuffles, if we don't have access to VEX encodings, the generic
-  // PSHUF instructions are preferable to some of the specialized forms despite
-  // requiring one more byte to encode because they can implicitly copy.
-  //
-  // IF we *do* have VEX encodings, then we can use shorter, more specific
-  // shuffle instructions freely as they can copy due to the extra register
-  // operand.
-  if (FloatDomain || Subtarget->hasAVX()) {
-    // We have both floating point and integer variants of shuffles that dup
-    // either the low or high half of the vector.
-    if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
-      bool Lo = Mask.equals(0, 0);
-      unsigned Shuffle;
-      MVT ShuffleVT;
-      // If the input is a floating point, check if we have SSE3 which will let
-      // us use MOVDDUP. That instruction is no slower than UNPCKLPD but has the
-      // option to fold the input operand into even an unaligned memory load.
-      if (FloatDomain && Lo && Subtarget->hasSSE3()) {
-        Shuffle = X86ISD::MOVDDUP;
-        ShuffleVT = MVT::v2f64;
-      } else if (FloatDomain) {
-        // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
-        // than the UNPCK variants.
-        Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
-        ShuffleVT = MVT::v4f32;
-      } else if (Subtarget->hasSSE2()) {
-        // We model everything else using UNPCK instructions. While MOVLHPS and
-        // MOVHLPS are shorter encodings they cannot accept a memory operand
-        // which overly constrains subsequent lowering.
-        Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
-        ShuffleVT = MVT::v2i64;
-      } else {
-        // No available instructions here.
-        return false;
-      }
-      if (Depth == 1 && Root->getOpcode() == Shuffle)
-        return false; // Nothing to do!
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
-      DCI.AddToWorklist(Op.getNode());
-      if (Shuffle == X86ISD::MOVDDUP)
-        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
-      else
-        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
-      DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
-                    /*AddTo*/ true);
-      return true;
-    }
-
-    // FIXME: We should match UNPCKLPS and UNPCKHPS here.
-
-    // For the integer domain we have specialized instructions for duplicating
-    // any element size from the low or high half.
-    if (!FloatDomain &&
-        (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3) ||
-         Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
-         Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
-         Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
-         Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
-                     15))) {
-      bool Lo = Mask[0] == 0;
-      unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
-      if (Depth == 1 && Root->getOpcode() == Shuffle)
-        return false; // Nothing to do!
-      MVT ShuffleVT;
-      switch (Mask.size()) {
-      case 4: ShuffleVT = MVT::v4i32; break;
-      case 8: ShuffleVT = MVT::v8i16; break;
-      case 16: ShuffleVT = MVT::v16i8; break;
-      };
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
-      DCI.AddToWorklist(Op.getNode());
+  // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
+  // vectors because it can have a load folded into it that UNPCK cannot. This
+  // doesn't preclude something switching to the shorter encoding post-RA.
+  if (FloatDomain && (Mask.equals(0, 0) || Mask.equals(1, 1))) {
+    bool Lo = Mask.equals(0, 0);
+    unsigned Shuffle;
+    MVT ShuffleVT;
+    // Check if we have SSE3 which will let us use MOVDDUP. That instruction
+    // is no slower than UNPCKLPD but has the option to fold the input operand
+    // into even an unaligned memory load.
+    if (Lo && Subtarget->hasSSE3()) {
+      Shuffle = X86ISD::MOVDDUP;
+      ShuffleVT = MVT::v2f64;
+    } else {
+      // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
+      // than the UNPCK variants.
+      Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
+      ShuffleVT = MVT::v4f32;
+    }
+    if (Depth == 1 && Root->getOpcode() == Shuffle)
+      return false; // Nothing to do!
+    Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+    DCI.AddToWorklist(Op.getNode());
+    if (Shuffle == X86ISD::MOVDDUP)
+      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+    else
       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
-      DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
-                    /*AddTo*/ true);
-      return true;
-    }
+    DCI.AddToWorklist(Op.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  // FIXME: We should match UNPCKLPS and UNPCKHPS here.
+
+  // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
+  // variants as none of these have single-instruction variants that are
+  // superior to the UNPCK formulation.
+  if (!FloatDomain &&
+      (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
+       Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
+       Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
+       Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
+                   15))) {
+    bool Lo = Mask[0] == 0;
+    unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+    if (Depth == 1 && Root->getOpcode() == Shuffle)
+      return false; // Nothing to do!
+    MVT ShuffleVT;
+    switch (Mask.size()) {
+    case 8:
+      ShuffleVT = MVT::v8i16;
+      break;
+    case 16:
+      ShuffleVT = MVT::v16i8;
+      break;
+    default:
+      llvm_unreachable("Impossible mask size!");
+    };
+    Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+    DCI.AddToWorklist(Op.getNode());
+    Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+    DCI.AddToWorklist(Op.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                  /*AddTo*/ true);
+    return true;
   }

   // Don't try to re-form single instruction chains under any circumstances now
diff --git a/llvm/test/CodeGen/X86/avx-basic.ll b/llvm/test/CodeGen/X86/avx-basic.ll
index ca54022..a8dae82 100644
--- a/llvm/test/CodeGen/X86/avx-basic.ll
+++ b/llvm/test/CodeGen/X86/avx-basic.ll
@@ -72,9 +72,9 @@ entry:
   ret <4 x i64> %shuffle
 }

-; CHECK: vpunpcklqdq
+; CHECK: vmovlhps
 ; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vpunpcklqdq
+; CHECK-NEXT: vmovlhps
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
 entry:
@@ -83,7 +83,7 @@ entry:
   ret <4 x i64> %shuffle
 }
 ; CHECK: vpshufd $-96
-; CHECK: vpunpckhdq
+; CHECK: vpshufd $-6
 ; CHECK: vinsertf128 $1
 define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
 entry:
diff --git a/llvm/test/CodeGen/X86/avx-sext.ll b/llvm/test/CodeGen/X86/avx-sext.ll
index 9bcf06f..fb2287f 100644
--- a/llvm/test/CodeGen/X86/avx-sext.ll
+++ b/llvm/test/CodeGen/X86/avx-sext.ll
@@ -156,7 +156,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {

 ; AVX-LABEL: sext_16i8_to_16i16
 ; AVX: vpmovsxbw
-; AVX: vpunpckhqdq
+; AVX: vmovhlps
 ; AVX: vpmovsxbw
 ; AVX: ret
 define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll
index a2537ce..058db31 100644
--- a/llvm/test/CodeGen/X86/avx-splat.ll
+++ b/llvm/test/CodeGen/X86/avx-splat.ll
@@ -19,7 +19,7 @@ entry:
 }

 ; CHECK: vmovq
-; CHECK-NEXT: vpunpcklqdq %xmm
+; CHECK-NEXT: vmovlhps %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
 entry:
diff --git a/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll b/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll
index f4539c8..ab79495 100644
--- a/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll
+++ b/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll
@@ -95,8 +95,8 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %
 ; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
 ; ExeDepsFix works top down, thus it coalesces vpunpcklqdq domain with
 ; vpand and there is nothing more you can do to match vmaxpd.
-; CHECK: vpunpcklqdq
-; CHECK: vpand
+; CHECK: vmovlhps
+; CHECK: vandps
 ; CHECK: vmaxpd
 ; CHECK: ret
 define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 8faa3f0..f959fea 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -639,7 +639,7 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
 ; ALL-LABEL: @shuffle_v8i16_XXXdXXXX
 ; ALL: # BB#0:
-; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm1[0,2,2,3]
+; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,2,3,3]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index cd79a38..affa9337 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-unknown"
 define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0001
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
@@ -18,7 +18,7 @@ define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
@@ -41,7 +41,7 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
@@ -52,7 +52,7 @@ define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_1000
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
@@ -63,8 +63,8 @@ define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_2200
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
@@ -76,7 +76,7 @@ define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[1],xmm0[0]
-; AVX1-NEXT: vpunpckhqdq {{.*}} # xmm1 = xmm1[1,1]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
@@ -281,7 +281,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0124
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -292,7 +292,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0142
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm2 = xmm2[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm2[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -304,7 +304,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -316,7 +316,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -336,7 +336,7 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm1[2,3,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -356,7 +356,7 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
-; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -368,8 +368,7 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @stress_test1
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT: vpunpckhqdq {{.*}} # xmm0 = xmm0[1,1]
-; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0