diff options
Diffstat (limited to 'llvm/lib/Target/X86/X86InterleavedAccess.cpp')
-rw-r--r-- | llvm/lib/Target/X86/X86InterleavedAccess.cpp | 139 |
1 files changed, 132 insertions, 7 deletions
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index de3f672..6649308a 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -14,7 +14,6 @@ /// //===--------------------------------------------------------------------===// -#include "X86ISelLowering.h" #include "X86TargetMachine.h" #include "llvm/Analysis/VectorUtils.h" @@ -69,8 +68,9 @@ class X86InterleavedAccessGroup { /// Out-V2 = p3, q3, r3, s3 /// Out-V3 = P4, q4, r4, s4 void transpose_4x4(ArrayRef<Instruction *> InputVectors, - SmallVectorImpl<Value *> &TrasposedVectors); - + SmallVectorImpl<Value *> &TransposedMatrix); + void interleave8bit_32x4(ArrayRef<Instruction *> InputVectors, + SmallVectorImpl<Value *> &TransposedMatrix); public: /// In order to form an interleaved access group X86InterleavedAccessGroup /// requires a wide-load instruction \p 'I', a group of interleaved-vectors @@ -101,18 +101,27 @@ bool X86InterleavedAccessGroup::isSupported() const { Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType(); unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy); unsigned SupportedNumElem = 4; + if (ShuffleElemSize == 8) + SupportedNumElem = 32; unsigned WideInstSize; - // Currently, lowering is supported for 4-element vectors of 64 bits on AVX. + // Currently, lowering is supported for the following vectors: + // 1. 4-element vectors of 64 bits on AVX. + // 2. 32-element vectors of 8 bits on AVX. if (isa<LoadInst>(Inst)) { - if (DL.getTypeSizeInBits(ShuffleVecTy) != SupportedNumElem * ShuffleElemSize) + if (DL.getTypeSizeInBits(ShuffleVecTy) != + SupportedNumElem * ShuffleElemSize) return false; WideInstSize = DL.getTypeSizeInBits(Inst->getType()); } else WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType()); - if (!Subtarget.hasAVX() || Factor != 4 || ShuffleElemSize != 64 || + if (DL.getTypeSizeInBits(ShuffleEltTy) == 8 && !isa<StoreInst>(Inst)) + return false; + + if (!Subtarget.hasAVX() || Factor != 4 || + (ShuffleElemSize != 64 && ShuffleElemSize != 8) || WideInstSize != (Factor * ShuffleElemSize * SupportedNumElem)) return false; @@ -163,6 +172,113 @@ void X86InterleavedAccessGroup::decompose( } } +// Create shuffle mask for concatenation of two half vectors. +// Low = false: mask generated for the shuffle +// shuffle(VEC1,VEC2,{NumElement/2, NumElement/2+1, NumElement/2+2..., +// NumElement-1, NumElement+NumElement/2, +// NumElement+NumElement/2+1..., 2*NumElement-1}) +// = concat(high_half(VEC1),high_half(VEC2)) +// Low = true: mask generated for the shuffle +// shuffle(VEC1,VEC2,{0,1,2,...,NumElement/2-1,NumElement, +// NumElement+1...,NumElement+NumElement/2-1}) +// = concat(low_half(VEC1),low_half(VEC2)) +static void createConcatShuffleMask(int NumElements, + SmallVectorImpl<uint32_t> &Mask, bool Low) { + int NumHalfElements = NumElements / 2; + int Offset = Low ? 0 : NumHalfElements; + for (int i = 0; i < NumHalfElements; ++i) + Mask.push_back(i + Offset); + for (int i = 0; i < NumHalfElements; ++i) + Mask.push_back(i + Offset + NumElements); +} + +void X86InterleavedAccessGroup::interleave8bit_32x4( + ArrayRef<Instruction *> Matrix, + SmallVectorImpl<Value *> &TransposedMatrix) { + + // Example: Assuming we start from the following vectors: + // Matrix[0]= c0 c1 c2 c3 c4 ... c31 + // Matrix[1]= m0 m1 m2 m3 m4 ... m31 + // Matrix[2]= y0 y1 y2 y3 y4 ... y31 + // Matrix[3]= k0 k1 k2 k3 k4 ... k31 + + TransposedMatrix.resize(4); + + SmallVector<uint32_t, 32> MaskHighTemp; + SmallVector<uint32_t, 32> MaskLowTemp; + SmallVector<uint32_t, 32> MaskHighTemp1; + SmallVector<uint32_t, 32> MaskLowTemp1; + SmallVector<uint32_t, 32> MaskHighTemp2; + SmallVector<uint32_t, 32> MaskLowTemp2; + SmallVector<uint32_t, 32> ConcatLow; + SmallVector<uint32_t, 32> ConcatHigh; + + // MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86 + // shuffle pattern. + + createUnpackShuffleMask<uint32_t>(MVT::v32i8, MaskHighTemp, false, false); + createUnpackShuffleMask<uint32_t>(MVT::v32i8, MaskLowTemp, true, false); + ArrayRef<uint32_t> MaskHigh = makeArrayRef(MaskHighTemp); + ArrayRef<uint32_t> MaskLow = makeArrayRef(MaskLowTemp); + + // ConcatHigh and ConcatLow built in the vperm2i128 and vinserti128 X86 + // shuffle pattern. + + createConcatShuffleMask(32, ConcatLow, true); + createConcatShuffleMask(32, ConcatHigh, false); + ArrayRef<uint32_t> MaskConcatLow = makeArrayRef(ConcatLow); + ArrayRef<uint32_t> MaskConcatHigh = makeArrayRef(ConcatHigh); + + // MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86 + // shuffle pattern. + + createUnpackShuffleMask<uint32_t>(MVT::v16i16, MaskLowTemp1, true, false); + createUnpackShuffleMask<uint32_t>(MVT::v16i16, MaskHighTemp1, false, false); + scaleShuffleMask<uint32_t>(2, makeArrayRef(MaskHighTemp1), MaskHighTemp2); + scaleShuffleMask<uint32_t>(2, makeArrayRef(MaskLowTemp1), MaskLowTemp2); + ArrayRef<uint32_t> MaskHighWord = makeArrayRef(MaskHighTemp2); + ArrayRef<uint32_t> MaskLowWord = makeArrayRef(MaskLowTemp2); + + // IntrVec1Low = c0 m0 c1 m1 ... c7 m7 | c16 m16 c17 m17 ... c23 m23 + // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31 + // IntrVec2Low = y0 k0 y1 k1 ... y7 k7 | y16 k16 y17 k17 ... y23 k23 + // IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31 + + Value *IntrVec1Low = + Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow); + Value *IntrVec1High = + Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh); + Value *IntrVec2Low = + Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow); + Value *IntrVec2High = + Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh); + + // cmyk4 cmyk5 cmyk6 cmyk7 | cmyk20 cmyk21 cmyk22 cmyk23 + // cmyk12 cmyk13 cmyk14 cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31 + // cmyk0 cmyk1 cmyk2 cmyk3 | cmyk16 cmyk17 cmyk18 cmyk19 + // cmyk8 cmyk9 cmyk10 cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27 + + Value *High = + Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord); + Value *High1 = + Builder.CreateShuffleVector(IntrVec1High, IntrVec2High, MaskHighWord); + Value *Low = + Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord); + Value *Low1 = + Builder.CreateShuffleVector(IntrVec1High, IntrVec2High, MaskLowWord); + + // cmyk0 cmyk1 cmyk2 cmyk3 | cmyk4 cmyk5 cmyk6 cmyk7 + // cmyk8 cmyk9 cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15 + // cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23 + // cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31 + + TransposedMatrix[0] = Builder.CreateShuffleVector(Low, High, MaskConcatLow); + TransposedMatrix[1] = Builder.CreateShuffleVector(Low1, High1, MaskConcatLow); + TransposedMatrix[2] = Builder.CreateShuffleVector(Low, High, MaskConcatHigh); + TransposedMatrix[3] = + Builder.CreateShuffleVector(Low1, High1, MaskConcatHigh); +} + void X86InterleavedAccessGroup::transpose_4x4( ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix) { @@ -229,7 +345,16 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // 2. Transpose the interleaved-vectors into vectors of contiguous // elements. - transpose_4x4(DecomposedVectors, TransposedVectors); + switch (NumSubVecElems) { + case 4: + transpose_4x4(DecomposedVectors, TransposedVectors); + break; + case 32: + interleave8bit_32x4(DecomposedVectors, TransposedVectors); + break; + default: + return false; + } // 3. Concatenate the contiguous-vectors back into a wide vector. Value *WideVec = concatenateVectors(Builder, TransposedVectors); |