author    Simon Pilgrim <llvm-dev@redking.me.uk>  2015-02-15 13:19:52 +0000
committer Simon Pilgrim <llvm-dev@redking.me.uk>  2015-02-15 13:19:52 +0000
commit 00bd79d794b1fbfe5dc069180974c6c2089f2c13 (patch)
tree   a35fd1356c5153fb0b91d06a00ed7fcf0cd6f561 /llvm/lib
parent 6a61efdce501f47f617e1fa7015392a4a35e5c7c (diff)
[X86][AVX2] vpslldq/vpsrldq byte shifts for AVX2
This patch refactors the existing lowerVectorShuffleAsByteShift function to add
support for 256-bit vectors on AVX2 targets. It also fixes a tablegen issue that
prevented the lowering of vpslldq/vpsrldq vec256 instructions.

Differential Revision: http://reviews.llvm.org/D7596

llvm-svn: 229311
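As a rough illustration of the matching idea, here is a standalone sketch of the per-128-bit-lane test for a left byte shift (a hypothetical helper using plain std::vector types, single-input case only; the actual patch code is in the diff below):

#include <vector>

// Return the per-lane element shift if Mask is a left byte shift of the
// first input, or -1 if no shift matches. Mask elements < 0 are undef;
// Zeroable[i] means element i is known to be zero. Illustrative only.
static int matchLeftByteShiftPerLane(const std::vector<int> &Mask,
                                     const std::vector<bool> &Zeroable,
                                     int NumLaneElts) {
  int NumElts = (int)Mask.size();
  for (int Shift = 1; Shift < NumLaneElts; ++Shift) {
    bool Match = true;
    for (int l = 0; l < NumElts && Match; l += NumLaneElts) {
      // The low Shift elements of each lane must shift in zeros...
      for (int i = 0; i != Shift; ++i)
        Match = Match && Zeroable[l + i];
      // ...and the rest must be a sequential run from the lane base
      // (undef elements are allowed anywhere in the run).
      for (int i = Shift; i != NumLaneElts; ++i)
        Match = Match && (Mask[l + i] < 0 || Mask[l + i] == l + i - Shift);
    }
    if (Match)
      return Shift;
  }
  return -1;
}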
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 135
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td      |   6
2 files changed, 79 insertions(+), 62 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b63911b..e539c39 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7834,12 +7834,10 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
///
-/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
+/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ
/// byte-shift instructions. The mask must consist of a shifted sequential
/// shuffle from one of the input vectors and zeroable elements for the
/// remaining 'shifted in' elements.
-///
-/// Note that this only handles 128-bit vector widths currently.
static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
@@ -7847,63 +7845,56 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
- int Size = Mask.size();
- int Scale = 16 / Size;
-
- for (int Shift = 1; Shift < Size; Shift++) {
- int ByteShift = Shift * Scale;
-
- // PSRLDQ : (little-endian) right byte shift
- // [ 5, 6, 7, zz, zz, zz, zz, zz]
- // [ -1, 5, 6, 7, zz, zz, zz, zz]
- // [ 1, 2, -1, -1, -1, -1, zz, zz]
- bool ZeroableRight = true;
- for (int i = Size - Shift; i < Size; i++) {
- ZeroableRight &= Zeroable[i];
- }
-
- if (ZeroableRight) {
- bool ValidShiftRight1 =
- isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
- bool ValidShiftRight2 =
- isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
-
- if (ValidShiftRight1 || ValidShiftRight2) {
- // Cast the inputs to v2i64 to match PSRLDQ.
- SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
- SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
- SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
- DAG.getConstant(ByteShift * 8, MVT::i8));
- return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
- }
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumLaneElts = NumElts / NumLanes;
+ int Scale = 16 / NumLaneElts;
+ MVT ShiftVT = MVT::getVectorVT(MVT::i64, 2 * NumLanes);
+
+ // PSLLDQ : (little-endian) left byte shift
+ // [ zz, 0, 1, 2, 3, 4, 5, 6]
+ // [ zz, zz, -1, -1, 2, 3, 4, -1]
+ // [ zz, zz, zz, zz, zz, zz, -1, 1]
+ // PSRLDQ : (little-endian) right byte shift
+ // [ 5, 6, 7, zz, zz, zz, zz, zz]
+ // [ -1, 5, 6, 7, zz, zz, zz, zz]
+ // [ 1, 2, -1, -1, -1, -1, zz, zz]
+ auto MatchByteShift = [&](int Shift) -> SDValue {
+ bool MatchLeft = true, MatchRight = true;
+ for (int l = 0; l < NumElts; l += NumLaneElts) {
+ for (int i = 0; i < Shift; ++i)
+ MatchLeft &= Zeroable[l + i];
+ for (int i = NumLaneElts - Shift; i < NumLaneElts; ++i)
+ MatchRight &= Zeroable[l + i];
}
+ if (!(MatchLeft || MatchRight))
+ return SDValue();
- // PSLLDQ : (little-endian) left byte shift
- // [ zz, 0, 1, 2, 3, 4, 5, 6]
- // [ zz, zz, -1, -1, 2, 3, 4, -1]
- // [ zz, zz, zz, zz, zz, zz, -1, 1]
- bool ZeroableLeft = true;
- for (int i = 0; i < Shift; i++) {
- ZeroableLeft &= Zeroable[i];
- }
-
- if (ZeroableLeft) {
- bool ValidShiftLeft1 =
- isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
- bool ValidShiftLeft2 =
- isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
-
- if (ValidShiftLeft1 || ValidShiftLeft2) {
- // Cast the inputs to v2i64 to match PSLLDQ.
- SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
- SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
- SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
- DAG.getConstant(ByteShift * 8, MVT::i8));
- return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
- }
+ bool MatchV1 = true, MatchV2 = true;
+ for (int l = 0; l < NumElts; l += NumLaneElts) {
+ unsigned Pos = MatchLeft ? Shift + l : l;
+ unsigned Low = MatchLeft ? l : Shift + l;
+ unsigned Len = NumLaneElts - Shift;
+ MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
+ MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + NumElts);
}
- }
+ if (!(MatchV1 || MatchV2))
+ return SDValue();
+
+ int ByteShift = Shift * Scale;
+ unsigned Op = MatchRight ? X86ISD::VSRLDQ : X86ISD::VSHLDQ;
+ SDValue V = MatchV1 ? V1 : V2;
+ V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+ V = DAG.getNode(Op, DL, ShiftVT, V,
+ DAG.getConstant(ByteShift * 8, MVT::i8));
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
+ };
+ for (int Shift = 1; Shift < NumLaneElts; ++Shift)
+ if (SDValue S = MatchByteShift(Shift))
+ return S;
+
+ // no match
return SDValue();
}
@@ -10674,12 +10665,6 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
}
-
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
}
// AVX2 provides a direct instruction for permuting a single input across
@@ -10688,6 +10673,17 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DAG));
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v4i64, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
@@ -10863,6 +10859,11 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v8i32, V1, V2, Mask, DAG))
return Shift;
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v8i32, V1, V2, Mask, DAG))
+ return Shift;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -10951,6 +10952,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v16i16, V1, V2, Mask, DAG))
return Shift;
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v16i16, V1, V2, Mask, DAG))
+ return Shift;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -11034,6 +11040,11 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v32i8, V1, V2, Mask, DAG))
return Shift;
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v32i8, V1, V2, Mask, DAG))
+ return Shift;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
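A note on the immediates before the tablegen change below: the VSHLDQ/VSRLDQ nodes built above carry the shift amount in bits (DAG.getConstant(ByteShift * 8, MVT::i8)), and the BYTE_imm fragment used by the patterns converts that back to the byte immediate the instructions encode. A minimal worked walkthrough, under assumed values:

// Hypothetical v8i32 shuffle <zz,0,1,2 | zz,4,5,6> on an AVX2 target:
int NumElts = 8;                      // v8i32
int NumLanes = 256 / 128;             // 2 lanes
int NumLaneElts = NumElts / NumLanes; // 4 elements per lane
int Scale = 16 / NumLaneElts;         // 4 bytes per element
int Shift = 1;                        // one zero shifted into each lane
int ByteShift = Shift * Scale;        // 4 bytes
int NodeImm = ByteShift * 8;          // 32: bit count stored on the node
// Selection converts 32 bits back to the 4-byte immediate: vpslldq $4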
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index d684976..dc54fc5 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4296,6 +4296,12 @@ let Predicates = [HasAVX2] in {
(VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
(VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
+
+ // Shift up / down and insert zeros.
+ def : Pat<(v4i64 (X86vshldq VR256:$src, (i8 imm:$amt))),
+ (VPSLLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
+ def : Pat<(v4i64 (X86vshrdq VR256:$src, (i8 imm:$amt))),
+ (VPSRLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
}
let Predicates = [UseSSE2] in {
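For context, a sketch of the kind of source-level shuffle the new patterns let the backend select as a single 256-bit byte shift (illustrative only; assumes clang vector extensions and an AVX2 target, and that the shuffle reaches the byte-shift lowering path intact):

typedef int v8si __attribute__((vector_size(32)));

// Shift each 128-bit lane left by one i32, shifting in zeros:
// lane 0 -> <zz, 0, 1, 2>, lane 1 -> <zz, 4, 5, 6>.
// With the VPSLLDQYri pattern above this can lower to vpslldq $4.
v8si shift_lanes_left_one(v8si v) {
  v8si zero = {0, 0, 0, 0, 0, 0, 0, 0};
  return __builtin_shufflevector(v, zero, 8, 0, 1, 2, 12, 4, 5, 6);
}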