author     Nemanja Ivanovic <nemanja.i.ibm@gmail.com>  2020-07-24 15:38:27 -0400
committer  Hans Wennborg <hans@chromium.org>           2020-07-27 16:25:51 +0200
commit     ca49a47b8f87fb942be7c043da2000375346d8b4 (patch)
tree       4ffc0c53c565c395bdb9957d4abf33057df4de29
parent     152c2b1befb1879e690f228d95bbfe1bd381a1da (diff)
[PowerPC] Fix computation of offset for load-and-splat for permuted loads
Unfortunately this is another regression from my canonicalization patch
(1fed131660b2). The patch contained two implicit assumptions:
1. That we would have a permuted load only if we are loading a partial vector
2. That a partial vector load would necessarily be as wide as the splat
However, assumption 2 is not correct since it is possible to do a wider load and
only splat a half of it. This patch corrects this assumption by simply checking
if the load is permuted and adjusting the offset if it is.

(cherry picked from commit 7d076e19e31a2a32e357cbdcf0183f88fe1fb0fb)
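
For orientation, here is a minimal standalone sketch of the corrected offset
computation. It is not code from the patch: the helper name splatLoadOffset is
made up, and the 4-byte formula is assumed to mirror the 8-byte one visible in
the final PPCISelLowering.cpp hunk below.

// Hypothetical model of the adjusted splat-offset math (not LLVM code).
// On little-endian targets, the splat index reported for a permuted
// scalar-to-vector load lands in the left half of a vector that is 8 bytes
// wider than the loaded value, so it is first shifted by 2 words (or 1
// doubleword) back onto the loaded memory before the byte offset is formed.
#include <cassert>
#include <cstdint>

static uint64_t splatLoadOffset(int SplatIdx, bool IsFourByte,
                                bool IsPermutedLoad, bool IsLittleEndian) {
  if (IsPermutedLoad) {
    assert(IsLittleEndian && "Unexpected permuted load on big endian target");
    SplatIdx += IsFourByte ? 2 : 1;
    assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
           "Splat of a value outside of the loaded memory");
  }
  if (IsFourByte) // assumed to mirror the in-tree 4-byte branch
    return IsLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
  return IsLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
}

For a permuted 4-byte load-and-splat, an unadjusted index of 0 would give
(3 - 0) * 4 = 12, past the end of an 8-byte load; the adjusted index of 2
gives the intended offset of 4.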
-rw-r--r--  llvm/lib/Target/PowerPC/PPCISelLowering.cpp             26
-rw-r--r--  llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll   88
2 files changed, 106 insertions, 8 deletions
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 1145484..980b5ea 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9111,13 +9111,15 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
                       Op0.getOperand(1));
 }
 
-static const SDValue *getNormalLoadInput(const SDValue &Op) {
+static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {
   const SDValue *InputLoad = &Op;
   if (InputLoad->getOpcode() == ISD::BITCAST)
     InputLoad = &InputLoad->getOperand(0);
   if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
-      InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED)
+      InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {
+    IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;
     InputLoad = &InputLoad->getOperand(0);
+  }
   if (InputLoad->getOpcode() != ISD::LOAD)
     return nullptr;
   LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
@@ -9289,7 +9291,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
   if (!BVNIsConstantSplat || SplatBitSize > 32) {
-    const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
+    bool IsPermutedLoad = false;
+    const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);
     // Handle load-and-splat patterns as we have instructions that will do this
     // in one go.
     if (InputLoad && DAG.isSplatValue(Op, true)) {
@@ -9912,7 +9915,8 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   // If this is a load-and-splat, we can do that with a single instruction
   // in some cases. However if the load has multiple uses, we don't want to
   // combine it because that will just produce multiple loads.
-  const SDValue *InputLoad = getNormalLoadInput(V1);
+  bool IsPermutedLoad = false;
+  const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad);
   if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
       (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
       InputLoad->hasOneUse()) {
@@ -9920,6 +9924,16 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     int SplatIdx =
         PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
 
+    // The splat index for permuted loads will be in the left half of the vector
+    // which is strictly wider than the loaded value by 8 bytes. So we need to
+    // adjust the splat index to point to the correct address in memory.
+    if (IsPermutedLoad) {
+      assert(isLittleEndian && "Unexpected permuted load on big endian target");
+      SplatIdx += IsFourByte ? 2 : 1;
+      assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
+             "Splat of a value outside of the loaded memory");
+    }
+
     LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
     // For 4-byte load-and-splat, we need Power9.
     if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
@@ -9929,10 +9943,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
       else
         Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
-      // If we are loading a partial vector, it does not make sense to adjust
-      // the base pointer. This happens with (splat (s_to_v_permuted (ld))).
-      if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64))
-        Offset = 0;
       SDValue BasePtr = LD->getBasePtr();
       if (Offset != 0)
         BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
index cc349ec..58984d3 100644
--- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
+++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
@@ -446,5 +446,93 @@ entry:
   ret <16 x i8> %shuffle
 }
+define dso_local <4 x i32> @testSplat4Low(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: testSplat4Low:
+; CHECK-P8: # %bb.0: # %entry
+; CHECK-P8-NEXT: ld r3, 0(r3)
+; CHECK-P8-NEXT: mtfprd f0, r3
+; CHECK-P8-NEXT: xxspltw v2, vs0, 0
+; CHECK-P8-NEXT: blr
+;
+; CHECK-P9-LABEL: testSplat4Low:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: addi r3, r3, 4
+; CHECK-P9-NEXT: lxvwsx v2, 0, r3
+; CHECK-P9-NEXT: blr
+;
+; CHECK-NOVSX-LABEL: testSplat4Low:
+; CHECK-NOVSX: # %bb.0: # %entry
+; CHECK-NOVSX-NEXT: ld r3, 0(r3)
+; CHECK-NOVSX-NEXT: addi r4, r1, -16
+; CHECK-NOVSX-NEXT: std r3, -16(r1)
+; CHECK-NOVSX-NEXT: lvx v2, 0, r4
+; CHECK-NOVSX-NEXT: vspltw v2, v2, 2
+; CHECK-NOVSX-NEXT: blr
+entry:
+ %0 = load <8 x i8>, <8 x i8>* %ptr, align 8
+ %vecinit18 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %1 = bitcast <16 x i8> %vecinit18 to <4 x i32>
+ ret <4 x i32> %1
+}
+
+; Function Attrs: norecurse nounwind readonly
+define dso_local <4 x i32> @testSplat4hi(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: testSplat4hi:
+; CHECK-P8: # %bb.0: # %entry
+; CHECK-P8-NEXT: ld r3, 0(r3)
+; CHECK-P8-NEXT: mtfprd f0, r3
+; CHECK-P8-NEXT: xxspltw v2, vs0, 1
+; CHECK-P8-NEXT: blr
+;
+; CHECK-P9-LABEL: testSplat4hi:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: lxvwsx v2, 0, r3
+; CHECK-P9-NEXT: blr
+;
+; CHECK-NOVSX-LABEL: testSplat4hi:
+; CHECK-NOVSX: # %bb.0: # %entry
+; CHECK-NOVSX-NEXT: ld r3, 0(r3)
+; CHECK-NOVSX-NEXT: addi r4, r1, -16
+; CHECK-NOVSX-NEXT: std r3, -16(r1)
+; CHECK-NOVSX-NEXT: lvx v2, 0, r4
+; CHECK-NOVSX-NEXT: vspltw v2, v2, 3
+; CHECK-NOVSX-NEXT: blr
+entry:
+ %0 = load <8 x i8>, <8 x i8>* %ptr, align 8
+ %vecinit22 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %1 = bitcast <16 x i8> %vecinit22 to <4 x i32>
+ ret <4 x i32> %1
+}
+
+; Function Attrs: norecurse nounwind readonly
+define dso_local <2 x i64> @testSplat8(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: testSplat8:
+; CHECK-P8: # %bb.0: # %entry
+; CHECK-P8-NEXT: lxvdsx v2, 0, r3
+; CHECK-P8-NEXT: blr
+;
+; CHECK-P9-LABEL: testSplat8:
+; CHECK-P9: # %bb.0: # %entry
+; CHECK-P9-NEXT: lxvdsx v2, 0, r3
+; CHECK-P9-NEXT: blr
+;
+; CHECK-NOVSX-LABEL: testSplat8:
+; CHECK-NOVSX: # %bb.0: # %entry
+; CHECK-NOVSX-NEXT: ld r3, 0(r3)
+; CHECK-NOVSX-NEXT: addis r4, r2, .LCPI19_0@toc@ha
+; CHECK-NOVSX-NEXT: addi r4, r4, .LCPI19_0@toc@l
+; CHECK-NOVSX-NEXT: lvx v2, 0, r4
+; CHECK-NOVSX-NEXT: std r3, -16(r1)
+; CHECK-NOVSX-NEXT: addi r3, r1, -16
+; CHECK-NOVSX-NEXT: lvx v3, 0, r3
+; CHECK-NOVSX-NEXT: vperm v2, v3, v3, v2
+; CHECK-NOVSX-NEXT: blr
+entry:
+ %0 = load <8 x i8>, <8 x i8>* %ptr, align 8
+ %vecinit30 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast <16 x i8> %vecinit30 to <2 x i64>
+ ret <2 x i64> %1
+}
+
declare double @dummy() local_unnamed_addr
attributes #0 = { nounwind }
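
As a rough cross-check of that arithmetic against the tests above (the
pre-adjustment splat indices here are assumed for illustration; the real ones
come from PPC::getSplatIdxForPPCMnemonics):

#include <cassert>

int main() {
  // Hypothetical little-endian walk-through. testSplat4Low splats the high
  // word of an 8-byte load: assuming a pre-adjustment index of 0, the 2-word
  // bias gives byte offset (3 - 2) * 4 == 4, matching the addi r3, r3, 4
  // before lxvwsx in the CHECK-P9 block above.
  int SplatIdx = 0;
  SplatIdx += 2;
  assert((3 - SplatIdx) * 4 == 4);

  // testSplat8 splats the full 8-byte value: assuming a pre-adjustment index
  // of 0, the 1-doubleword bias gives (1 - 1) * 8 == 0, i.e. lxvdsx straight
  // off the unmodified base pointer.
  SplatIdx = 0;
  SplatIdx += 1;
  assert((1 - SplatIdx) * 8 == 0);
  return 0;
}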