| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2015-02-15 13:19:52 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2015-02-15 13:19:52 +0000 |
| commit | 00bd79d794b1fbfe5dc069180974c6c2089f2c13 (patch) | |
| tree | a35fd1356c5153fb0b91d06a00ed7fcf0cd6f561 /llvm/lib | |
| parent | 6a61efdce501f47f617e1fa7015392a4a35e5c7c (diff) | |
[X86][AVX2] vpslldq/vpsrldq byte shifts for AVX2
This patch refactors the existing lowerVectorShuffleAsByteShift function to add support for 256-bit vectors on AVX2 targets.
It also fixes a TableGen issue that prevented the 256-bit vpslldq/vpsrldq instructions from being selected: the X86InstrSSE.td patterns mapping the X86vshldq/X86vshrdq DAG nodes on v4i64 to VPSLLDQYri/VPSRLDQYri were missing.
Differential Revision: http://reviews.llvm.org/D7596
llvm-svn: 229311
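The core of the refactor is the new per-lane matching (the MatchByteShift lambda in the diff below): within each 128-bit lane, the "shifted in" positions must be zeroable and the remaining positions must form a sequential run from one of the two inputs. The following standalone C++ sketch illustrates that matching idea; it is not the LLVM implementation. It simplifies by folding the Zeroable analysis into -1 mask entries, and the helper names are illustrative.

```cpp
// Simplified sketch of the per-lane byte-shift matching behind the
// refactored lowerVectorShuffleAsByteShift. Mask entries 0..NumElts-1
// select elements of the input; -1 means undef/zeroable (a simplification:
// LLVM tracks zeroable elements separately from undef).
#include <cstdio>
#include <vector>

// Returns true if Mask[Pos..Pos+Len) is Low, Low+1, ... (or -1 for undef).
static bool isSequentialOrUndef(const std::vector<int> &Mask, int Pos,
                                int Len, int Low) {
  for (int i = 0; i < Len; ++i)
    if (Mask[Pos + i] != -1 && Mask[Pos + i] != Low + i)
      return false;
  return true;
}

// Checks whether Mask is a left shift by Shift elements, applied
// independently to each 128-bit lane of NumLaneElts elements.
static bool matchesLeftShift(const std::vector<int> &Mask, int NumLaneElts,
                             int Shift) {
  int NumElts = (int)Mask.size();
  for (int l = 0; l < NumElts; l += NumLaneElts) {
    // The low Shift elements of each lane must be zeroable.
    for (int i = 0; i < Shift; ++i)
      if (Mask[l + i] != -1)
        return false;
    // The rest must be a sequential run starting at the lane base.
    if (!isSequentialOrUndef(Mask, l + Shift, NumLaneElts - Shift, l))
      return false;
  }
  return true;
}

int main() {
  // A v8i32 shuffle of a 256-bit vector: two lanes of 4 elements.
  // Per lane this is <zz, 0, 1, 2> / <zz, 4, 5, 6>.
  std::vector<int> Mask = {-1, 0, 1, 2, -1, 4, 5, 6};
  for (int Shift = 1; Shift < 4; ++Shift)
    if (matchesLeftShift(Mask, /*NumLaneElts=*/4, Shift))
      printf("matches per-lane left shift by %d element(s)\n", Shift);
  return 0;
}
```

For v8i32, Scale is 16 / 4 = 4 bytes per element, so the one-element match printed here corresponds to a vpslldq by 4 bytes within each lane.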
Diffstat (limited to 'llvm/lib')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 135 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 6 |
2 files changed, 79 insertions, 62 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b63911b..e539c39 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7834,12 +7834,10 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
 
 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
 ///
-/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
+/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ
 /// byte-shift instructions. The mask must consist of a shifted sequential
 /// shuffle from one of the input vectors and zeroable elements for the
 /// remaining 'shifted in' elements.
-///
-/// Note that this only handles 128-bit vector widths currently.
 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
                                              SDValue V2, ArrayRef<int> Mask,
                                              SelectionDAG &DAG) {
@@ -7847,63 +7845,56 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
 
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
 
-  int Size = Mask.size();
-  int Scale = 16 / Size;
-
-  for (int Shift = 1; Shift < Size; Shift++) {
-    int ByteShift = Shift * Scale;
-
-    // PSRLDQ : (little-endian) right byte shift
-    // [ 5, 6, 7, zz, zz, zz, zz, zz]
-    // [ -1, 5, 6, 7, zz, zz, zz, zz]
-    // [ 1, 2, -1, -1, -1, -1, zz, zz]
-    bool ZeroableRight = true;
-    for (int i = Size - Shift; i < Size; i++) {
-      ZeroableRight &= Zeroable[i];
-    }
-
-    if (ZeroableRight) {
-      bool ValidShiftRight1 =
-          isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
-      bool ValidShiftRight2 =
-          isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
-
-      if (ValidShiftRight1 || ValidShiftRight2) {
-        // Cast the inputs to v2i64 to match PSRLDQ.
-        SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
-        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
-        SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
-                                      DAG.getConstant(ByteShift * 8, MVT::i8));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
-      }
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumLaneElts = NumElts / NumLanes;
+  int Scale = 16 / NumLaneElts;
+  MVT ShiftVT = MVT::getVectorVT(MVT::i64, 2 * NumLanes);
+
+  // PSLLDQ : (little-endian) left byte shift
+  // [ zz, 0, 1, 2, 3, 4, 5, 6]
+  // [ zz, zz, -1, -1, 2, 3, 4, -1]
+  // [ zz, zz, zz, zz, zz, zz, -1, 1]
+  // PSRLDQ : (little-endian) right byte shift
+  // [ 5, 6, 7, zz, zz, zz, zz, zz]
+  // [ -1, 5, 6, 7, zz, zz, zz, zz]
+  // [ 1, 2, -1, -1, -1, -1, zz, zz]
+  auto MatchByteShift = [&](int Shift) -> SDValue {
+    bool MatchLeft = true, MatchRight = true;
+    for (int l = 0; l < NumElts; l += NumLaneElts) {
+      for (int i = 0; i < Shift; ++i)
+        MatchLeft &= Zeroable[l + i];
+      for (int i = NumLaneElts - Shift; i < NumLaneElts; ++i)
+        MatchRight &= Zeroable[l + i];
     }
+    if (!(MatchLeft || MatchRight))
+      return SDValue();
 
-    // PSLLDQ : (little-endian) left byte shift
-    // [ zz, 0, 1, 2, 3, 4, 5, 6]
-    // [ zz, zz, -1, -1, 2, 3, 4, -1]
-    // [ zz, zz, zz, zz, zz, zz, -1, 1]
-    bool ZeroableLeft = true;
-    for (int i = 0; i < Shift; i++) {
-      ZeroableLeft &= Zeroable[i];
-    }
-
-    if (ZeroableLeft) {
-      bool ValidShiftLeft1 =
-          isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
-      bool ValidShiftLeft2 =
-          isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
-
-      if (ValidShiftLeft1 || ValidShiftLeft2) {
-        // Cast the inputs to v2i64 to match PSLLDQ.
-        SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
-        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
-        SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
-                                      DAG.getConstant(ByteShift * 8, MVT::i8));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
-      }
+    bool MatchV1 = true, MatchV2 = true;
+    for (int l = 0; l < NumElts; l += NumLaneElts) {
+      unsigned Pos = MatchLeft ? Shift + l : l;
+      unsigned Low = MatchLeft ? l : Shift + l;
+      unsigned Len = NumLaneElts - Shift;
+      MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
+      MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + NumElts);
     }
-  }
+    if (!(MatchV1 || MatchV2))
+      return SDValue();
+
+    int ByteShift = Shift * Scale;
+    unsigned Op = MatchRight ? X86ISD::VSRLDQ : X86ISD::VSHLDQ;
+    SDValue V = MatchV1 ? V1 : V2;
+    V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+    V = DAG.getNode(Op, DL, ShiftVT, V,
+                    DAG.getConstant(ByteShift * 8, MVT::i8));
+    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+  };
 
+  for (int Shift = 1; Shift < NumLaneElts; ++Shift)
+    if (SDValue S = MatchByteShift(Shift))
+      return S;
+
+  // no match
   return SDValue();
 }
 
@@ -10674,12 +10665,6 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
     }
-
-    // Use dedicated unpack instructions for masks that match their pattern.
-    if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
-      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
-    if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
-      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
   }
 
   // AVX2 provides a direct instruction for permuting a single input across
@@ -10688,6 +10673,17 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v4i64, V1, V2, Mask, DAG))
+    return Shift;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle. However, if we have AVX2 and either inputs are already in place,
   // we will be able to shuffle even across lanes the other input in a single
@@ -10863,6 +10859,11 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           DL, MVT::v8i32, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i32, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -10951,6 +10952,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           DL, MVT::v16i16, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v16i16, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -11034,6 +11040,11 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           DL, MVT::v32i8, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v32i8, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index d684976..dc54fc5 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4296,6 +4296,12 @@ let Predicates = [HasAVX2] in {
             (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
   def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
             (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
+
+  // Shift up / down and insert zero's.
+  def : Pat<(v4i64 (X86vshldq VR256:$src, (i8 imm:$amt))),
+            (VPSLLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
+  def : Pat<(v4i64 (X86vshrdq VR256:$src, (i8 imm:$amt))),
+            (VPSRLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
 }
 
 let Predicates = [UseSSE2] in {
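The matcher works lane by lane because the AVX2 forms of these instructions do not shift the full 256-bit value: vpslldq/vpsrldq byte-shift each 128-bit lane independently, with zeros shifted in. A minimal sketch of those semantics using the corresponding intrinsics (assuming an AVX2-capable host; the compile command is illustrative):

```cpp
// Demonstrates the per-128-bit-lane behavior of VPSLLDQ/VPSRLDQ via the
// AVX2 intrinsics they implement. Compile with, e.g.: g++ -mavx2 demo.cpp
#include <immintrin.h>
#include <cstdio>

static void dump(const char *name, __m256i v) {
  unsigned char b[32];
  _mm256_storeu_si256((__m256i *)b, v);
  printf("%-8s", name);
  for (int i = 0; i < 32; ++i)
    printf(" %02x", b[i]);
  printf("\n");
}

int main() {
  unsigned char src[32];
  for (int i = 0; i < 32; ++i)
    src[i] = (unsigned char)(i + 1); // bytes 01 02 ... 20
  __m256i v = _mm256_loadu_si256((const __m256i *)src);

  dump("v", v);
  // vpslldq $4: within each 16-byte lane, bytes move toward higher
  // addresses and zeros fill the low end of the lane.
  dump("slli 4", _mm256_slli_si256(v, 4));
  // vpsrldq $4: within each 16-byte lane, bytes move toward lower
  // addresses and zeros fill the high end of the lane.
  dump("srli 4", _mm256_srli_si256(v, 4));
  return 0;
}
```

The left-shift dump shows the low 4 bytes of each 16-byte lane zeroed, mirroring the "[ zz, 0, 1, 2, ...]" mask comments in the patch.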