diff options
Diffstat (limited to 'llvm/lib/CodeGen/InterleavedAccessPass.cpp')
-rw-r--r-- | llvm/lib/CodeGen/InterleavedAccessPass.cpp | 177 |
1 file changed, 169 insertions, 8 deletions
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index c6d5533..3f6a69e 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -60,6 +60,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
@@ -478,6 +479,157 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
   return true;
 }
 
+// For an (de)interleave tree like this:
+//
+//   A   C B   D
+//   |___| |___|
+//     |_____|
+//        |
+//     A B C D
+//
+// We will get ABCD at the end while the leaf operands/results
+// are ACBD, which are also what we initially collected in
+// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI
+// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need
+// to reorder them by interleaving these values.
+static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
+  unsigned NumLeaves = SubLeaves.size();
+  if (NumLeaves == 2)
+    return;
+
+  assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1);
+
+  const unsigned HalfLeaves = NumLeaves / 2;
+  // Visit the sub-trees.
+  interleaveLeafValues(SubLeaves.take_front(HalfLeaves));
+  interleaveLeafValues(SubLeaves.drop_front(HalfLeaves));
+
+  SmallVector<Value *, 8> Buffer;
+  // a0 a1 a2 a3 b0 b1 b2 b3
+  // -> a0 b0 a1 b1 a2 b2 a3 b3
+  for (unsigned i = 0U; i < NumLeaves; ++i)
+    Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]);
+
+  llvm::copy(Buffer, SubLeaves.begin());
+}
+
+static bool
+getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
+                          SmallVectorImpl<Instruction *> &DeadInsts) {
+  assert(II->getIntrinsicID() == Intrinsic::vector_interleave2);
+
+  // Visit with BFS
+  SmallVector<IntrinsicInst *, 8> Queue;
+  Queue.push_back(II);
+  while (!Queue.empty()) {
+    IntrinsicInst *Current = Queue.front();
+    Queue.erase(Queue.begin());
+
+    // All the intermediate intrinsics will be deleted.
+    DeadInsts.push_back(Current);
+
+    for (unsigned I = 0; I < 2; ++I) {
+      Value *Op = Current->getOperand(I);
+      if (auto *OpII = dyn_cast<IntrinsicInst>(Op))
+        if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) {
+          Queue.push_back(OpII);
+          continue;
+        }
+
+      // If this is not a perfectly balanced tree, the leaf
+      // result types would be different.
+      if (!Operands.empty() && Op->getType() != Operands.back()->getType())
+        return false;
+
+      Operands.push_back(Op);
+    }
+  }
+
+  const unsigned Factor = Operands.size();
+  // Currently we only recognize power-of-two factors.
+  // FIXME: should we assert here instead?
+  if (Factor <= 1 || !isPowerOf2_32(Factor))
+    return false;
+
+  interleaveLeafValues(Operands);
+  return true;
+}
+
+static bool
+getVectorDeinterleaveFactor(IntrinsicInst *II,
+                            SmallVectorImpl<Value *> &Results,
+                            SmallVectorImpl<Instruction *> &DeadInsts) {
+  assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2);
+  using namespace PatternMatch;
+  if (!II->hasNUses(2))
+    return false;
+
+  // Visit with BFS
+  SmallVector<IntrinsicInst *, 8> Queue;
+  Queue.push_back(II);
+  while (!Queue.empty()) {
+    IntrinsicInst *Current = Queue.front();
+    Queue.erase(Queue.begin());
+    assert(Current->hasNUses(2));
+
+    // All the intermediate intrinsics will be deleted from the bottom-up.
+    DeadInsts.insert(DeadInsts.begin(), Current);
+
+    ExtractValueInst *LHS = nullptr, *RHS = nullptr;
+    for (User *Usr : Current->users()) {
+      if (!isa<ExtractValueInst>(Usr))
+        return false;
+
+      auto *EV = cast<ExtractValueInst>(Usr);
+      // Intermediate ExtractValue instructions will also be deleted.
+      DeadInsts.insert(DeadInsts.begin(), EV);
+      ArrayRef<unsigned> Indices = EV->getIndices();
+      if (Indices.size() != 1)
+        return false;
+
+      if (Indices[0] == 0 && !LHS)
+        LHS = EV;
+      else if (Indices[0] == 1 && !RHS)
+        RHS = EV;
+      else
+        return false;
+    }
+
+    // We have legal indices. At this point we're either going
+    // to continue the traversal or push the leaf values into Results.
+    for (ExtractValueInst *EV : {LHS, RHS}) {
+      // Continue the traversal. We're playing safe here and matching only the
+      // expression consisting of a perfectly balanced binary tree in which all
+      // intermediate values are only used once.
+      if (EV->hasOneUse() &&
+          match(EV->user_back(),
+                m_Intrinsic<Intrinsic::vector_deinterleave2>()) &&
+          EV->user_back()->hasNUses(2)) {
+        auto *EVUsr = cast<IntrinsicInst>(EV->user_back());
+        Queue.push_back(EVUsr);
+        continue;
+      }
+
+      // If this is not a perfectly balanced tree, the leaf
+      // result types would be different.
+      if (!Results.empty() && EV->getType() != Results.back()->getType())
+        return false;
+
+      // Save the leaf value.
+      Results.push_back(EV);
+    }
+  }
+
+  const unsigned Factor = Results.size();
+  // Currently we only recognize power-of-two factors.
+  // FIXME: should we assert here instead?
+  if (Factor <= 1 || !isPowerOf2_32(Factor))
+    return false;
+
+  interleaveLeafValues(Results);
+  return true;
+}
+
 bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
     IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
   LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));
@@ -485,16 +637,21 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
   if (!LI || !LI->hasOneUse() || !LI->isSimple())
     return false;
 
-  LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+  SmallVector<Value *, 8> DeinterleaveValues;
+  SmallVector<Instruction *, 8> DeinterleaveDeadInsts;
+  if (!getVectorDeinterleaveFactor(DI, DeinterleaveValues,
+                                   DeinterleaveDeadInsts))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI
+                    << " with factor = " << DeinterleaveValues.size() << "\n");
 
   // Try and match this with target specific intrinsics.
-  SmallVector<Instruction *, 4> DeinterleaveDeadInsts;
-  if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI, DeinterleaveDeadInsts))
+  if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
     return false;
 
   DeadInsts.insert(DeinterleaveDeadInsts.begin(), DeinterleaveDeadInsts.end());
   // We now have a target-specific load, so delete the old one.
-  DeadInsts.insert(DI);
   DeadInsts.insert(LI);
   return true;
 }
@@ -509,16 +666,20 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
   if (!SI || !SI->isSimple())
     return false;
 
-  LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+  SmallVector<Value *, 8> InterleaveValues;
+  SmallVector<Instruction *, 8> InterleaveDeadInsts;
+  if (!getVectorInterleaveFactor(II, InterleaveValues, InterleaveDeadInsts))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II
+                    << " with factor = " << InterleaveValues.size() << "\n");
 
-  SmallVector<Instruction *, 4> InterleaveDeadInsts;
   // Try and match this with target specific intrinsics.
-  if (!TLI->lowerInterleaveIntrinsicToStore(II, SI, InterleaveDeadInsts))
+  if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues))
     return false;
 
   // We now have a target-specific store, so delete the old one.
   DeadInsts.insert(SI);
-  DeadInsts.insert(II);
   DeadInsts.insert(InterleaveDeadInsts.begin(), InterleaveDeadInsts.end());
   return true;
 }