diff options
author | Maryam Moghadas <maryammo@ca.ibm.com> | 2023-09-13 13:47:42 -0500 |
---|---|---|
committer | Maryam Moghadas <maryammo@ca.ibm.com> | 2023-09-13 15:00:49 -0500 |
commit | 7b021f2e64e1f5f7bdb4a7d72be12ad46e26cf79 (patch) | |
tree | 05ffedae5447f2faf7897e53af76f0e17a68a252 | |
parent | d6d4a526f424f48b3cd15163287924bd6c93674b (diff) | |
download | llvm-7b021f2e64e1f5f7bdb4a7d72be12ad46e26cf79.zip llvm-7b021f2e64e1f5f7bdb4a7d72be12ad46e26cf79.tar.gz llvm-7b021f2e64e1f5f7bdb4a7d72be12ad46e26cf79.tar.bz2 |
[PowerPC] Optimize VPERM and fix code order for swapping vector operands on LE
This patch reverts commit 7614ba0a5db8 to optimize VPERM when one of its
vector operands is XXSWAPD, similar to XXPERM. It also reorganizes the
little-endian swap code so that the vector operands are swapped after
the mask operand has been adjusted. This ensures that the vector
operands are swapped at the correct point in the code, resulting in a
valid constant pool entry for the mask operand.
Reviewed By: stefanp
Differential Revision: https://reviews.llvm.org/D149083
-rw-r--r-- | llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 73 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/build-vector-tests.ll | 44 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll | 22 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll | 15 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vperm-swap.ll | 41 |
5 files changed, 93 insertions, 102 deletions
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 0ebdd77..95f2243 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -10314,11 +10314,6 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG, bool isLittleEndian = Subtarget.isLittleEndian(); bool isPPC64 = Subtarget.isPPC64(); - // Only need to place items backwards in LE, - // the mask will be properly calculated. - if (isLittleEndian) - std::swap(V1, V2); - if (Subtarget.hasVSX() && Subtarget.hasP9Vector() && (V1->hasOneUse() || V2->hasOneUse())) { LLVM_DEBUG(dbgs() << "At least one of two input vectors are dead - using " @@ -10328,7 +10323,8 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG, // The second input to XXPERM is also an output so if the second input has // multiple uses then copying is necessary, as a result we want the // single-use operand to be used as the second input to prevent copying. - if (!V2->hasOneUse() && V1->hasOneUse()) { + if ((!isLittleEndian && !V2->hasOneUse() && V1->hasOneUse()) || + (isLittleEndian && !V1->hasOneUse() && V2->hasOneUse())) { std::swap(V1, V2); NeedSwap = !NeedSwap; } @@ -10367,27 +10363,24 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG, for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { unsigned SrcElt = PermMask[i] < 0 ? 
0 : PermMask[i]; - if (Opcode == PPCISD::XXPERM) { - if (V1HasXXSWAPD) { - if (SrcElt < 8) - SrcElt += 8; - else if (SrcElt < 16) - SrcElt -= 8; - } - if (V2HasXXSWAPD) { - if (SrcElt > 23) - SrcElt -= 8; - else if (SrcElt > 15) - SrcElt += 8; - } - if (NeedSwap) { - if (SrcElt < 16) - SrcElt += 16; - else - SrcElt -= 16; - } + if (V1HasXXSWAPD) { + if (SrcElt < 8) + SrcElt += 8; + else if (SrcElt < 16) + SrcElt -= 8; + } + if (V2HasXXSWAPD) { + if (SrcElt > 23) + SrcElt -= 8; + else if (SrcElt > 15) + SrcElt += 8; + } + if (NeedSwap) { + if (SrcElt < 16) + SrcElt += 16; + else + SrcElt -= 16; } - for (unsigned j = 0; j != BytesPerElement; ++j) if (isLittleEndian) ResultMask.push_back( @@ -10397,18 +10390,19 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG, DAG.getConstant(SrcElt * BytesPerElement + j, dl, MVT::i32)); } - if (Opcode == PPCISD::XXPERM && (V1HasXXSWAPD || V2HasXXSWAPD)) { - if (V1HasXXSWAPD) { - dl = SDLoc(V1->getOperand(0)); - V1 = V1->getOperand(0)->getOperand(1); - } - if (V2HasXXSWAPD) { - dl = SDLoc(V2->getOperand(0)); - V2 = V2->getOperand(0)->getOperand(1); - } - if (isPPC64 && ValType != MVT::v2f64) + if (V1HasXXSWAPD) { + dl = SDLoc(V1->getOperand(0)); + V1 = V1->getOperand(0)->getOperand(1); + } + if (V2HasXXSWAPD) { + dl = SDLoc(V2->getOperand(0)); + V2 = V2->getOperand(0)->getOperand(1); + } + + if (isPPC64 && (V1HasXXSWAPD || V2HasXXSWAPD)) { + if (ValType != MVT::v2f64) V1 = DAG.getBitcast(MVT::v2f64, V1); - if (isPPC64 && V2.getValueType() != MVT::v2f64) + if (V2.getValueType() != MVT::v2f64) V2 = DAG.getBitcast(MVT::v2f64, V2); } @@ -10429,6 +10423,11 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG, if (Opcode == PPCISD::XXPERM) VPermMask = DAG.getBitcast(MVT::v4i32, VPermMask); + // Only need to place items backwards in LE, + // the mask was properly calculated. 
+ if (isLittleEndian) + std::swap(V1, V2); + SDValue VPERMNode = DAG.getNode(Opcode, dl, V1.getValueType(), V1, V2, VPermMask); diff --git a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll index 6410738..f729018d 100644 --- a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll +++ b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll @@ -1058,16 +1058,15 @@ define <4 x i32> @fromDiffMemVarDi(ptr nocapture readonly %arr, i32 signext %ele ; ; P8LE-LABEL: fromDiffMemVarDi: ; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addis r5, r2, .LCPI9_0@toc@ha ; P8LE-NEXT: sldi r4, r4, 2 +; P8LE-NEXT: addi r5, r5, .LCPI9_0@toc@l ; P8LE-NEXT: add r3, r3, r4 +; P8LE-NEXT: lxvd2x vs0, 0, r5 ; P8LE-NEXT: addi r3, r3, -12 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r3, r2, .LCPI9_0@toc@ha -; P8LE-NEXT: addi r3, r3, .LCPI9_0@toc@l +; P8LE-NEXT: lxvd2x v3, 0, r3 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v2, v2, v3 +; P8LE-NEXT: vperm v2, v3, v3, v2 ; P8LE-NEXT: blr entry: %idxprom = sext i32 %elem to i64 @@ -1478,13 +1477,12 @@ define <4 x i32> @fromDiffMemConsDConvftoi(ptr nocapture readonly %ptr) { ; ; P8LE-LABEL: fromDiffMemConsDConvftoi: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r3, r2, .LCPI18_0@toc@ha -; P8LE-NEXT: addi r3, r3, .LCPI18_0@toc@l +; P8LE-NEXT: addis r4, r2, .LCPI18_0@toc@ha +; P8LE-NEXT: lxvd2x v3, 0, r3 +; P8LE-NEXT: addi r4, r4, .LCPI18_0@toc@l +; P8LE-NEXT: lxvd2x vs0, 0, r4 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v2, v2, v3 +; P8LE-NEXT: vperm v2, v3, v3, v2 ; P8LE-NEXT: xvcvspsxws v2, v2 ; P8LE-NEXT: blr entry: @@ -2580,16 +2578,15 @@ define <4 x i32> @fromDiffMemVarDui(ptr nocapture readonly %arr, i32 signext %el ; ; P8LE-LABEL: fromDiffMemVarDui: ; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addis r5, r2, .LCPI41_0@toc@ha ; P8LE-NEXT: sldi r4, 
r4, 2 +; P8LE-NEXT: addi r5, r5, .LCPI41_0@toc@l ; P8LE-NEXT: add r3, r3, r4 +; P8LE-NEXT: lxvd2x vs0, 0, r5 ; P8LE-NEXT: addi r3, r3, -12 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r3, r2, .LCPI41_0@toc@ha -; P8LE-NEXT: addi r3, r3, .LCPI41_0@toc@l +; P8LE-NEXT: lxvd2x v3, 0, r3 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v2, v2, v3 +; P8LE-NEXT: vperm v2, v3, v3, v2 ; P8LE-NEXT: blr entry: %idxprom = sext i32 %elem to i64 @@ -3000,13 +2997,12 @@ define <4 x i32> @fromDiffMemConsDConvftoui(ptr nocapture readonly %ptr) { ; ; P8LE-LABEL: fromDiffMemConsDConvftoui: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r3, r2, .LCPI50_0@toc@ha -; P8LE-NEXT: addi r3, r3, .LCPI50_0@toc@l +; P8LE-NEXT: addis r4, r2, .LCPI50_0@toc@ha +; P8LE-NEXT: lxvd2x v3, 0, r3 +; P8LE-NEXT: addi r4, r4, .LCPI50_0@toc@l +; P8LE-NEXT: lxvd2x vs0, 0, r4 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v2, v2, v3 +; P8LE-NEXT: vperm v2, v3, v3, v2 ; P8LE-NEXT: xvcvspuxws v2, v2 ; P8LE-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll index 2569731..11cc8ab 100644 --- a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll @@ -183,14 +183,13 @@ entry: define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_none_v16i8: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI2_0@toc@ha +; CHECK-LE-P8-NEXT: lxvd2x v3, 0, r4 ; CHECK-LE-P8-NEXT: mtvsrd v4, r3 -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI2_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 ; 
CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v16i8: @@ -431,14 +430,13 @@ entry: define <16 x i8> @test_none_v8i16(i16 %arg, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_none_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI5_0@toc@ha +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI5_0@toc@ha +; CHECK-LE-P8-NEXT: lxvd2x v3, 0, r4 ; CHECK-LE-P8-NEXT: mtvsrd v4, r3 -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI5_0@toc@l +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI5_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v8i16: diff --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll index 37820af..201bc5b 100644 --- a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll @@ -469,19 +469,18 @@ entry: define void @test_none_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readonly %ptr2) { ; CHECK-LE-P8-LABEL: test_none_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI4_0@toc@ha -; CHECK-LE-P8-NEXT: lxsdx v4, 0, r3 +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r3 ; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI4_1@toc@ha -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI4_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x v4, 0, r4 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI4_0@toc@l ; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI4_1@toc@l -; 
CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r4 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd v3, vs1 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v3, v4, v2 ; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 ; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 ; CHECK-LE-P8-NEXT: xxswapd vs0, v2 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 diff --git a/llvm/test/CodeGen/PowerPC/vperm-swap.ll b/llvm/test/CodeGen/PowerPC/vperm-swap.ll index 0a3b5ae..1f97978 100644 --- a/llvm/test/CodeGen/PowerPC/vperm-swap.ll +++ b/llvm/test/CodeGen/PowerPC/vperm-swap.ll @@ -4,32 +4,31 @@ define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8: .LCPI0_0: -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 30 # 0x1e +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 22 # 0x16 ; CHECK-LE-P8-NEXT: .byte 7 # 0x7 -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 ; 
CHECK-LE-P8-LABEL: test_none_v16i8: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; CHECK-LE-P8-NEXT: lxvd2x v3, 0, r4 ; CHECK-LE-P8-NEXT: mtvsrd v4, r3 -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI0_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-LE-P8-NEXT: blr entry: %lhs = load <16 x i8>, ptr %b, align 4 |