| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2015-02-15 13:19:52 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2015-02-15 13:19:52 +0000 |
| commit | 00bd79d794b1fbfe5dc069180974c6c2089f2c13 (patch) | |
| tree | a35fd1356c5153fb0b91d06a00ed7fcf0cd6f561 /llvm/lib | |
| parent | 6a61efdce501f47f617e1fa7015392a4a35e5c7c (diff) | |
[X86][AVX2] vpslldq/vpsrldq byte shifts for AVX2
This patch refactors the existing lowerVectorShuffleAsByteShift function to add support for 256-bit vectors on AVX2 targets.
It also fixes a TableGen issue that prevented the 256-bit vpslldq/vpsrldq instructions from being selected: the X86InstrSSE.td patterns mapping the X86vshldq/X86vshrdq DAG nodes on v4i64 to VPSLLDQYri/VPSRLDQYri were missing.
Differential Revision: http://reviews.llvm.org/D7596
llvm-svn: 229311
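The core of the refactor is the new per-lane matching (the MatchByteShift lambda in the diff below): within each 128-bit lane, the "shifted in" positions must be zeroable and the remaining positions must form a sequential run from one of the two inputs. The following standalone C++ sketch illustrates that matching idea; it is not the LLVM implementation. It simplifies by folding the Zeroable analysis into -1 mask entries, and the helper names are illustrative.

```cpp
// Simplified sketch of the per-lane byte-shift matching behind the
// refactored lowerVectorShuffleAsByteShift. Mask entries 0..NumElts-1
// select elements of the input; -1 means undef/zeroable (a simplification:
// LLVM tracks zeroable elements separately from undef).
#include <cstdio>
#include <vector>

// Returns true if Mask[Pos..Pos+Len) is Low, Low+1, ... (or -1 for undef).
static bool isSequentialOrUndef(const std::vector<int> &Mask, int Pos,
                                int Len, int Low) {
  for (int i = 0; i < Len; ++i)
    if (Mask[Pos + i] != -1 && Mask[Pos + i] != Low + i)
      return false;
  return true;
}

// Checks whether Mask is a left shift by Shift elements, applied
// independently to each 128-bit lane of NumLaneElts elements.
static bool matchesLeftShift(const std::vector<int> &Mask, int NumLaneElts,
                             int Shift) {
  int NumElts = (int)Mask.size();
  for (int l = 0; l < NumElts; l += NumLaneElts) {
    // The low Shift elements of each lane must be zeroable.
    for (int i = 0; i < Shift; ++i)
      if (Mask[l + i] != -1)
        return false;
    // The rest must be a sequential run starting at the lane base.
    if (!isSequentialOrUndef(Mask, l + Shift, NumLaneElts - Shift, l))
      return false;
  }
  return true;
}

int main() {
  // A v8i32 shuffle of a 256-bit vector: two lanes of 4 elements.
  // Per lane this is <zz, 0, 1, 2> / <zz, 4, 5, 6>.
  std::vector<int> Mask = {-1, 0, 1, 2, -1, 4, 5, 6};
  for (int Shift = 1; Shift < 4; ++Shift)
    if (matchesLeftShift(Mask, /*NumLaneElts=*/4, Shift))
      printf("matches per-lane left shift by %d element(s)\n", Shift);
  return 0;
}
```

For v8i32, Scale is 16 / 4 = 4 bytes per element, so the one-element match printed here corresponds to a vpslldq by 4 bytes within each lane.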
Diffstat (limited to 'llvm/lib')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 135 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 6 |
2 files changed, 79 insertions, 62 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b63911b..e539c39 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7834,12 +7834,10 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
 
 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
 ///
-/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
+/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ
 /// byte-shift instructions. The mask must consist of a shifted sequential
 /// shuffle from one of the input vectors and zeroable elements for the
 /// remaining 'shifted in' elements.
-///
-/// Note that this only handles 128-bit vector widths currently.
 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
                                              SDValue V2, ArrayRef<int> Mask,
                                              SelectionDAG &DAG) {
@@ -7847,63 +7845,56 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
 
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
 
-  int Size = Mask.size();
-  int Scale = 16 / Size;
-
-  for (int Shift = 1; Shift < Size; Shift++) {
-    int ByteShift = Shift * Scale;
-
-    // PSRLDQ : (little-endian) right byte shift
-    // [ 5, 6, 7, zz, zz, zz, zz, zz]
-    // [ -1, 5, 6, 7, zz, zz, zz, zz]
-    // [ 1, 2, -1, -1, -1, -1, zz, zz]
-    bool ZeroableRight = true;
-    for (int i = Size - Shift; i < Size; i++) {
-      ZeroableRight &= Zeroable[i];
-    }
-
-    if (ZeroableRight) {
-      bool ValidShiftRight1 =
-          isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
-      bool ValidShiftRight2 =
-          isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
-
-      if (ValidShiftRight1 || ValidShiftRight2) {
-        // Cast the inputs to v2i64 to match PSRLDQ.
-        SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
-        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
-        SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
-                                      DAG.getConstant(ByteShift * 8, MVT::i8));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
-      }
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumLaneElts = NumElts / NumLanes;
+  int Scale = 16 / NumLaneElts;
+  MVT ShiftVT = MVT::getVectorVT(MVT::i64, 2 * NumLanes);
+
+  // PSLLDQ : (little-endian) left byte shift
+  // [ zz, 0, 1, 2, 3, 4, 5, 6]
+  // [ zz, zz, -1, -1, 2, 3, 4, -1]
+  // [ zz, zz, zz, zz, zz, zz, -1, 1]
+  // PSRLDQ : (little-endian) right byte shift
+  // [ 5, 6, 7, zz, zz, zz, zz, zz]
+  // [ -1, 5, 6, 7, zz, zz, zz, zz]
+  // [ 1, 2, -1, -1, -1, -1, zz, zz]
+  auto MatchByteShift = [&](int Shift) -> SDValue {
+    bool MatchLeft = true, MatchRight = true;
+    for (int l = 0; l < NumElts; l += NumLaneElts) {
+      for (int i = 0; i < Shift; ++i)
+        MatchLeft &= Zeroable[l + i];
+      for (int i = NumLaneElts - Shift; i < NumLaneElts; ++i)
+        MatchRight &= Zeroable[l + i];
     }
+    if (!(MatchLeft || MatchRight))
+      return SDValue();
 
-    // PSLLDQ : (little-endian) left byte shift
-    // [ zz, 0, 1, 2, 3, 4, 5, 6]
-    // [ zz, zz, -1, -1, 2, 3, 4, -1]
-    // [ zz, zz, zz, zz, zz, zz, -1, 1]
-    bool ZeroableLeft = true;
-    for (int i = 0; i < Shift; i++) {
-      ZeroableLeft &= Zeroable[i];
-    }
-
-    if (ZeroableLeft) {
-      bool ValidShiftLeft1 =
-          isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
-      bool ValidShiftLeft2 =
-          isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
-
-      if (ValidShiftLeft1 || ValidShiftLeft2) {
-        // Cast the inputs to v2i64 to match PSLLDQ.
-        SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
-        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
-        SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
-                                      DAG.getConstant(ByteShift * 8, MVT::i8));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
-      }
+    bool MatchV1 = true, MatchV2 = true;
+    for (int l = 0; l < NumElts; l += NumLaneElts) {
+      unsigned Pos = MatchLeft ? Shift + l : l;
+      unsigned Low = MatchLeft ? l : Shift + l;
+      unsigned Len = NumLaneElts - Shift;
+      MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
+      MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + NumElts);
     }
-  }
+    if (!(MatchV1 || MatchV2))
+      return SDValue();
+
+    int ByteShift = Shift * Scale;
+    unsigned Op = MatchRight ? X86ISD::VSRLDQ : X86ISD::VSHLDQ;
+    SDValue V = MatchV1 ? V1 : V2;
+    V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+    V = DAG.getNode(Op, DL, ShiftVT, V,
+                    DAG.getConstant(ByteShift * 8, MVT::i8));
+    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+  };
 
+  for (int Shift = 1; Shift < NumLaneElts; ++Shift)
+    if (SDValue S = MatchByteShift(Shift))
+      return S;
+
+  // no match
   return SDValue();
 }
 
@@ -10674,12 +10665,6 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
     }
-
-    // Use dedicated unpack instructions for masks that match their pattern.
-    if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
-      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
-    if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
-      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
   }
 
   // AVX2 provides a direct instruction for permuting a single input across
@@ -10688,6 +10673,17 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v4i64, V1, V2, Mask, DAG))
+    return Shift;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle. However, if we have AVX2 and either inputs are already in place,
   // we will be able to shuffle even across lanes the other input in a single
@@ -10863,6 +10859,11 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           DL, MVT::v8i32, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i32, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -10951,6 +10952,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           DL, MVT::v16i16, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v16i16, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -11034,6 +11040,11 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           DL, MVT::v32i8, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v32i8, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index d684976..dc54fc5 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4296,6 +4296,12 @@ let Predicates = [HasAVX2] in {
             (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
   def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
             (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
+
+  // Shift up / down and insert zero's.
+  def : Pat<(v4i64 (X86vshldq VR256:$src, (i8 imm:$amt))),
+            (VPSLLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
+  def : Pat<(v4i64 (X86vshrdq VR256:$src, (i8 imm:$amt))),
+            (VPSRLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
 }
 
 let Predicates = [UseSSE2] in {
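The matcher works lane by lane because the AVX2 forms of these instructions do not shift the full 256-bit value: vpslldq/vpsrldq byte-shift each 128-bit lane independently, with zeros shifted in. A minimal sketch of those semantics using the corresponding intrinsics (assuming an AVX2-capable host; the compile command is illustrative):

```cpp
// Demonstrates the per-128-bit-lane behavior of VPSLLDQ/VPSRLDQ via the
// AVX2 intrinsics they implement. Compile with, e.g.: g++ -mavx2 demo.cpp
#include <immintrin.h>
#include <cstdio>

static void dump(const char *name, __m256i v) {
  unsigned char b[32];
  _mm256_storeu_si256((__m256i *)b, v);
  printf("%-8s", name);
  for (int i = 0; i < 32; ++i)
    printf(" %02x", b[i]);
  printf("\n");
}

int main() {
  unsigned char src[32];
  for (int i = 0; i < 32; ++i)
    src[i] = (unsigned char)(i + 1); // bytes 01 02 ... 20
  __m256i v = _mm256_loadu_si256((const __m256i *)src);

  dump("v", v);
  // vpslldq $4: within each 16-byte lane, bytes move toward higher
  // addresses and zeros fill the low end of the lane.
  dump("slli 4", _mm256_slli_si256(v, 4));
  // vpsrldq $4: within each 16-byte lane, bytes move toward lower
  // addresses and zeros fill the high end of the lane.
  dump("srli 4", _mm256_srli_si256(v, 4));
  return 0;
}
```

The left-shift dump shows the low 4 bytes of each 16-byte lane zeroed, mirroring the "[ zz, 0, 1, 2, ...]" mask comments in the patch.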