aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/X86/X86InterleavedAccess.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/X86/X86InterleavedAccess.cpp')
-rw-r--r--llvm/lib/Target/X86/X86InterleavedAccess.cpp139
1 files changed, 132 insertions, 7 deletions
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index de3f672..6649308a 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -14,7 +14,6 @@
///
//===--------------------------------------------------------------------===//
-#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/VectorUtils.h"
@@ -69,8 +68,9 @@ class X86InterleavedAccessGroup {
/// Out-V2 = p3, q3, r3, s3
/// Out-V3 = P4, q4, r4, s4
void transpose_4x4(ArrayRef<Instruction *> InputVectors,
- SmallVectorImpl<Value *> &TrasposedVectors);
-
+ SmallVectorImpl<Value *> &TransposedMatrix);
+ void interleave8bit_32x4(ArrayRef<Instruction *> InputVectors,
+ SmallVectorImpl<Value *> &TransposedMatrix);
public:
/// In order to form an interleaved access group X86InterleavedAccessGroup
/// requires a wide-load instruction \p 'I', a group of interleaved-vectors
@@ -101,18 +101,27 @@ bool X86InterleavedAccessGroup::isSupported() const {
Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
unsigned SupportedNumElem = 4;
+ if (ShuffleElemSize == 8)
+ SupportedNumElem = 32;
unsigned WideInstSize;
- // Currently, lowering is supported for 4-element vectors of 64 bits on AVX.
+ // Currently, lowering is supported for the following vectors:
+ // 1. 4-element vectors of 64 bits on AVX.
+ // 2. 32-element vectors of 8 bits on AVX (interleaved stores only).
if (isa<LoadInst>(Inst)) {
- if (DL.getTypeSizeInBits(ShuffleVecTy) != SupportedNumElem * ShuffleElemSize)
+ if (DL.getTypeSizeInBits(ShuffleVecTy) !=
+ SupportedNumElem * ShuffleElemSize)
return false;
WideInstSize = DL.getTypeSizeInBits(Inst->getType());
} else
WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
- if (!Subtarget.hasAVX() || Factor != 4 || ShuffleElemSize != 64 ||
+ if (DL.getTypeSizeInBits(ShuffleEltTy) == 8 && !isa<StoreInst>(Inst))
+ return false;
+
+ if (!Subtarget.hasAVX() || Factor != 4 ||
+ (ShuffleElemSize != 64 && ShuffleElemSize != 8) ||
WideInstSize != (Factor * ShuffleElemSize * SupportedNumElem))
return false;
@@ -163,6 +172,113 @@ void X86InterleavedAccessGroup::decompose(
}
}
+// Append to Mask the indices that concatenate the same half of two vectors.
+// Low = false: mask generated for the shuffle
+// shuffle(VEC1,VEC2,{NumElements/2, NumElements/2+1, NumElements/2+2...,
+// NumElements-1, NumElements+NumElements/2,
+// NumElements+NumElements/2+1..., 2*NumElements-1})
+// = concat(high_half(VEC1),high_half(VEC2))
+// Low = true: mask generated for the shuffle
+// shuffle(VEC1,VEC2,{0,1,2,...,NumElements/2-1,NumElements,
+// NumElements+1...,NumElements+NumElements/2-1})
+// = concat(low_half(VEC1),low_half(VEC2))
+static void createConcatShuffleMask(int NumElements,
+ SmallVectorImpl<uint32_t> &Mask, bool Low) {
+ int NumHalfElements = NumElements / 2;
+ int Offset = Low ? 0 : NumHalfElements; // first index of the selected half
+ for (int i = 0; i < NumHalfElements; ++i)
+ Mask.push_back(i + Offset); // selected half of VEC1
+ for (int i = 0; i < NumHalfElements; ++i)
+ Mask.push_back(i + Offset + NumElements); // same half of VEC2
+}
+
+void X86InterleavedAccessGroup::interleave8bit_32x4(
+ ArrayRef<Instruction *> Matrix,
+ SmallVectorImpl<Value *> &TransposedMatrix) {
+ // Interleave the four input rows element-wise: c0 m0 y0 k0 c1 m1 y1 k1 ...
+ // Example: Assuming we start from the following vectors:
+ // Matrix[0]= c0 c1 c2 c3 c4 ... c31
+ // Matrix[1]= m0 m1 m2 m3 m4 ... m31
+ // Matrix[2]= y0 y1 y2 y3 y4 ... y31
+ // Matrix[3]= k0 k1 k2 k3 k4 ... k31
+
+ TransposedMatrix.resize(4); // the four result vectors are assigned below
+
+ SmallVector<uint32_t, 32> MaskHighTemp;
+ SmallVector<uint32_t, 32> MaskLowTemp;
+ SmallVector<uint32_t, 32> MaskHighTemp1;
+ SmallVector<uint32_t, 32> MaskLowTemp1;
+ SmallVector<uint32_t, 32> MaskHighTemp2;
+ SmallVector<uint32_t, 32> MaskLowTemp2;
+ SmallVector<uint32_t, 32> ConcatLow;
+ SmallVector<uint32_t, 32> ConcatHigh;
+
+ // MaskHighTemp and MaskLowTemp are built in the vpunpckhbw and vpunpcklbw
+ // X86 shuffle pattern.
+
+ createUnpackShuffleMask<uint32_t>(MVT::v32i8, MaskHighTemp, false, false);
+ createUnpackShuffleMask<uint32_t>(MVT::v32i8, MaskLowTemp, true, false);
+ ArrayRef<uint32_t> MaskHigh = makeArrayRef(MaskHighTemp);
+ ArrayRef<uint32_t> MaskLow = makeArrayRef(MaskLowTemp);
+
+ // ConcatHigh and ConcatLow are built in the vperm2i128 and vinserti128 X86
+ // shuffle pattern.
+
+ createConcatShuffleMask(32, ConcatLow, true);
+ createConcatShuffleMask(32, ConcatHigh, false);
+ ArrayRef<uint32_t> MaskConcatLow = makeArrayRef(ConcatLow);
+ ArrayRef<uint32_t> MaskConcatHigh = makeArrayRef(ConcatHigh);
+
+ // MaskHighTemp2 and MaskLowTemp2 follow the vpunpckhwd and vpunpcklwd X86
+ // shuffle pattern: v16i16 masks scaled by 2 to index the 8-bit elements.
+
+ createUnpackShuffleMask<uint32_t>(MVT::v16i16, MaskLowTemp1, true, false);
+ createUnpackShuffleMask<uint32_t>(MVT::v16i16, MaskHighTemp1, false, false);
+ scaleShuffleMask<uint32_t>(2, makeArrayRef(MaskHighTemp1), MaskHighTemp2);
+ scaleShuffleMask<uint32_t>(2, makeArrayRef(MaskLowTemp1), MaskLowTemp2);
+ ArrayRef<uint32_t> MaskHighWord = makeArrayRef(MaskHighTemp2);
+ ArrayRef<uint32_t> MaskLowWord = makeArrayRef(MaskLowTemp2);
+
+ // IntrVec1Low = c0 m0 c1 m1 ... c7 m7 | c16 m16 c17 m17 ... c23 m23
+ // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
+ // IntrVec2Low = y0 k0 y1 k1 ... y7 k7 | y16 k16 y17 k17 ... y23 k23
+ // IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
+
+ Value *IntrVec1Low =
+ Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
+ Value *IntrVec1High =
+ Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
+ Value *IntrVec2Low =
+ Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
+ Value *IntrVec2High =
+ Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);
+
+ // High = cmyk4 cmyk5 cmyk6 cmyk7 | cmyk20 cmyk21 cmyk22 cmyk23
+ // High1 = cmyk12 cmyk13 cmyk14 cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
+ // Low = cmyk0 cmyk1 cmyk2 cmyk3 | cmyk16 cmyk17 cmyk18 cmyk19
+ // Low1 = cmyk8 cmyk9 cmyk10 cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27
+
+ Value *High =
+ Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
+ Value *High1 =
+ Builder.CreateShuffleVector(IntrVec1High, IntrVec2High, MaskHighWord);
+ Value *Low =
+ Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
+ Value *Low1 =
+ Builder.CreateShuffleVector(IntrVec1High, IntrVec2High, MaskLowWord);
+
+ // TransposedMatrix[0] = cmyk0 cmyk1 cmyk2 cmyk3 | cmyk4 cmyk5 cmyk6 cmyk7
+ // TransposedMatrix[1] = cmyk8 cmyk9 cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15
+ // TransposedMatrix[2] = cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23
+ // TransposedMatrix[3] = cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31
+
+ TransposedMatrix[0] = Builder.CreateShuffleVector(Low, High, MaskConcatLow);
+ TransposedMatrix[1] = Builder.CreateShuffleVector(Low1, High1, MaskConcatLow);
+ TransposedMatrix[2] = Builder.CreateShuffleVector(Low, High, MaskConcatHigh);
+ TransposedMatrix[3] =
+ Builder.CreateShuffleVector(Low1, High1, MaskConcatHigh);
+}
+
void X86InterleavedAccessGroup::transpose_4x4(
ArrayRef<Instruction *> Matrix,
SmallVectorImpl<Value *> &TransposedMatrix) {
@@ -229,7 +345,16 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
// 2. Transpose the interleaved-vectors into vectors of contiguous
// elements.
- transpose_4x4(DecomposedVectors, TransposedVectors);
+ switch (NumSubVecElems) {
+ case 4:
+ transpose_4x4(DecomposedVectors, TransposedVectors);
+ break;
+ case 32:
+ interleave8bit_32x4(DecomposedVectors, TransposedVectors);
+ break;
+ default:
+ return false;
+ }
// 3. Concatenate the contiguous-vectors back into a wide vector.
Value *WideVec = concatenateVectors(Builder, TransposedVectors);