diff options
Diffstat (limited to 'llvm/lib/CodeGen/InterleavedAccessPass.cpp')
-rw-r--r-- | llvm/lib/CodeGen/InterleavedAccessPass.cpp | 177 |
1 file changed, 169 insertions, 8 deletions
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index c6d5533..3f6a69e 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -60,6 +60,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
@@ -478,6 +479,157 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
   return true;
 }
 
+// For an (de)interleave tree like this:
+//
+//   A   C B   D
+//   |___| |___|
+//     |_____|
+//        |
+//     A B C D
+//
+// We will get ABCD at the end while the leaf operands/results
+// are ACBD, which are also what we initially collected in
+// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI
+// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need
+// to reorder them by interleaving these values.
+static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
+  unsigned NumLeaves = SubLeaves.size();
+  if (NumLeaves == 2)
+    return;
+
+  assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1);
+
+  const unsigned HalfLeaves = NumLeaves / 2;
+  // Visit the sub-trees.
+  interleaveLeafValues(SubLeaves.take_front(HalfLeaves));
+  interleaveLeafValues(SubLeaves.drop_front(HalfLeaves));
+
+  SmallVector<Value *, 8> Buffer;
+  // a0 a1 a2 a3 b0 b1 b2 b3
+  // -> a0 b0 a1 b1 a2 b2 a3 b3
+  for (unsigned i = 0U; i < NumLeaves; ++i)
+    Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]);
+
+  llvm::copy(Buffer, SubLeaves.begin());
+}
+
+static bool
+getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
+                          SmallVectorImpl<Instruction *> &DeadInsts) {
+  assert(II->getIntrinsicID() == Intrinsic::vector_interleave2);
+
+  // Visit with BFS
+  SmallVector<IntrinsicInst *, 8> Queue;
+  Queue.push_back(II);
+  while (!Queue.empty()) {
+    IntrinsicInst *Current = Queue.front();
+    Queue.erase(Queue.begin());
+
+    // All the intermediate intrinsics will be deleted.
+    DeadInsts.push_back(Current);
+
+    for (unsigned I = 0; I < 2; ++I) {
+      Value *Op = Current->getOperand(I);
+      if (auto *OpII = dyn_cast<IntrinsicInst>(Op))
+        if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) {
+          Queue.push_back(OpII);
+          continue;
+        }
+
+      // If this is not a perfectly balanced tree, the leaf
+      // result types would be different.
+      if (!Operands.empty() && Op->getType() != Operands.back()->getType())
+        return false;
+
+      Operands.push_back(Op);
+    }
+  }
+
+  const unsigned Factor = Operands.size();
+  // Currently we only recognize power-of-two factors.
+  // FIXME: should we assert here instead?
+  if (Factor <= 1 || !isPowerOf2_32(Factor))
+    return false;
+
+  interleaveLeafValues(Operands);
+  return true;
+}
+
+static bool
+getVectorDeinterleaveFactor(IntrinsicInst *II,
+                            SmallVectorImpl<Value *> &Results,
+                            SmallVectorImpl<Instruction *> &DeadInsts) {
+  assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2);
+  using namespace PatternMatch;
+  if (!II->hasNUses(2))
+    return false;
+
+  // Visit with BFS
+  SmallVector<IntrinsicInst *, 8> Queue;
+  Queue.push_back(II);
+  while (!Queue.empty()) {
+    IntrinsicInst *Current = Queue.front();
+    Queue.erase(Queue.begin());
+    assert(Current->hasNUses(2));
+
+    // All the intermediate intrinsics will be deleted from the bottom-up.
+    DeadInsts.insert(DeadInsts.begin(), Current);
+
+    ExtractValueInst *LHS = nullptr, *RHS = nullptr;
+    for (User *Usr : Current->users()) {
+      if (!isa<ExtractValueInst>(Usr))
+        return false;
+
+      auto *EV = cast<ExtractValueInst>(Usr);
+      // Intermediate ExtractValue instructions will also be deleted.
+      DeadInsts.insert(DeadInsts.begin(), EV);
+      ArrayRef<unsigned> Indices = EV->getIndices();
+      if (Indices.size() != 1)
+        return false;
+
+      if (Indices[0] == 0 && !LHS)
+        LHS = EV;
+      else if (Indices[0] == 1 && !RHS)
+        RHS = EV;
+      else
+        return false;
+    }
+
+    // We have legal indices. At this point we're either going
+    // to continue the traversal or push the leaf values into Results.
+    for (ExtractValueInst *EV : {LHS, RHS}) {
+      // Continue the traversal. We're playing safe here and matching only the
+      // expression consisting of a perfectly balanced binary tree in which all
+      // intermediate values are only used once.
+      if (EV->hasOneUse() &&
+          match(EV->user_back(),
+                m_Intrinsic<Intrinsic::vector_deinterleave2>()) &&
+          EV->user_back()->hasNUses(2)) {
+        auto *EVUsr = cast<IntrinsicInst>(EV->user_back());
+        Queue.push_back(EVUsr);
+        continue;
+      }
+
+      // If this is not a perfectly balanced tree, the leaf
+      // result types would be different.
+      if (!Results.empty() && EV->getType() != Results.back()->getType())
+        return false;
+
+      // Save the leaf value.
+      Results.push_back(EV);
+    }
+  }
+
+  const unsigned Factor = Results.size();
+  // Currently we only recognize power-of-two factors.
+  // FIXME: should we assert here instead?
+  if (Factor <= 1 || !isPowerOf2_32(Factor))
+    return false;
+
+  interleaveLeafValues(Results);
+  return true;
+}
+
 bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
     IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
   LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));
@@ -485,16 +637,21 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
   if (!LI || !LI->hasOneUse() || !LI->isSimple())
     return false;
 
-  LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+  SmallVector<Value *, 8> DeinterleaveValues;
+  SmallVector<Instruction *, 8> DeinterleaveDeadInsts;
+  if (!getVectorDeinterleaveFactor(DI, DeinterleaveValues,
+                                   DeinterleaveDeadInsts))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI
+                    << " with factor = " << DeinterleaveValues.size() << "\n");
 
   // Try and match this with target specific intrinsics.
-  SmallVector<Instruction *, 4> DeinterleaveDeadInsts;
-  if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI, DeinterleaveDeadInsts))
+  if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
     return false;
 
   DeadInsts.insert(DeinterleaveDeadInsts.begin(), DeinterleaveDeadInsts.end());
   // We now have a target-specific load, so delete the old one.
-  DeadInsts.insert(DI);
   DeadInsts.insert(LI);
   return true;
 }
@@ -509,16 +666,20 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
   if (!SI || !SI->isSimple())
     return false;
 
-  LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+  SmallVector<Value *, 8> InterleaveValues;
+  SmallVector<Instruction *, 8> InterleaveDeadInsts;
+  if (!getVectorInterleaveFactor(II, InterleaveValues, InterleaveDeadInsts))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II
+                    << " with factor = " << InterleaveValues.size() << "\n");
 
-  SmallVector<Instruction *, 4> InterleaveDeadInsts;
   // Try and match this with target specific intrinsics.
-  if (!TLI->lowerInterleaveIntrinsicToStore(II, SI, InterleaveDeadInsts))
+  if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues))
     return false;
 
   // We now have a target-specific store, so delete the old one.
   DeadInsts.insert(SI);
-  DeadInsts.insert(II);
   DeadInsts.insert(InterleaveDeadInsts.begin(), InterleaveDeadInsts.end());
   return true;
 }