diff options
author | Maryam Moghadas <maryammo@ca.ibm.com> | 2023-09-13 13:47:42 -0500 |
---|---|---|
committer | Maryam Moghadas <maryammo@ca.ibm.com> | 2023-09-13 15:00:49 -0500 |
commit | 7b021f2e64e1f5f7bdb4a7d72be12ad46e26cf79 (patch) | |
tree | 05ffedae5447f2faf7897e53af76f0e17a68a252 | |
parent | d6d4a526f424f48b3cd15163287924bd6c93674b (diff) | |
download | llvm-7b021f2e64e1f5f7bdb4a7d72be12ad46e26cf79.zip llvm-7b021f2e64e1f5f7bdb4a7d72be12ad46e26cf79.tar.gz llvm-7b021f2e64e1f5f7bdb4a7d72be12ad46e26cf79.tar.bz2 |
[PowerPC] Optimize VPERM and fix code order for swapping vector operands on LE
This patch reverts commit 7614ba0a5db8 to optimize VPERM when one of its
vector operands is XXSWAPD, similar to XXPERM. It also reorganizes the
little-endian swap code so that the vector operands are swapped after
the mask operand has been adjusted. This ensures that the vector
operands are swapped at the correct point in the code, resulting in a
valid constant pool entry for the mask operand.
Reviewed By: stefanp
Differential Revision: https://reviews.llvm.org/D149083
-rw-r--r-- | llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 73 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/build-vector-tests.ll | 44 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll | 22 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll | 15 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vperm-swap.ll | 41 |
5 files changed, 93 insertions, 102 deletions
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 0ebdd77..95f2243 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -10314,11 +10314,6 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG, bool isLittleEndian = Subtarget.isLittleEndian(); bool isPPC64 = Subtarget.isPPC64(); - // Only need to place items backwards in LE, - // the mask will be properly calculated. - if (isLittleEndian) - std::swap(V1, V2); - if (Subtarget.hasVSX() && Subtarget.hasP9Vector() && (V1->hasOneUse() || V2->hasOneUse())) { LLVM_DEBUG(dbgs() << "At least one of two input vectors are dead - using " @@ -10328,7 +10323,8 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG, // The second input to XXPERM is also an output so if the second input has // multiple uses then copying is necessary, as a result we want the // single-use operand to be used as the second input to prevent copying. - if (!V2->hasOneUse() && V1->hasOneUse()) { + if ((!isLittleEndian && !V2->hasOneUse() && V1->hasOneUse()) || + (isLittleEndian && !V1->hasOneUse() && V2->hasOneUse())) { std::swap(V1, V2); NeedSwap = !NeedSwap; } @@ -10367,27 +10363,24 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG, for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { unsigned SrcElt = PermMask[i] < 0 ? 
0 : PermMask[i]; - if (Opcode == PPCISD::XXPERM) { - if (V1HasXXSWAPD) { - if (SrcElt < 8) - SrcElt += 8; - else if (SrcElt < 16) - SrcElt -= 8; - } - if (V2HasXXSWAPD) { - if (SrcElt > 23) - SrcElt -= 8; - else if (SrcElt > 15) - SrcElt += 8; - } - if (NeedSwap) { - if (SrcElt < 16) - SrcElt += 16; - else - SrcElt -= 16; - } + if (V1HasXXSWAPD) { + if (SrcElt < 8) + SrcElt += 8; + else if (SrcElt < 16) + SrcElt -= 8; + } + if (V2HasXXSWAPD) { + if (SrcElt > 23) + SrcElt -= 8; + else if (SrcElt > 15) + SrcElt += 8; + } + if (NeedSwap) { + if (SrcElt < 16) + SrcElt += 16; + else + SrcElt -= 16; } - for (unsigned j = 0; j != BytesPerElement; ++j) if (isLittleEndian) ResultMask.push_back( @@ -10397,18 +10390,19 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG, DAG.getConstant(SrcElt * BytesPerElement + j, dl, MVT::i32)); } - if (Opcode == PPCISD::XXPERM && (V1HasXXSWAPD || V2HasXXSWAPD)) { - if (V1HasXXSWAPD) { - dl = SDLoc(V1->getOperand(0)); - V1 = V1->getOperand(0)->getOperand(1); - } - if (V2HasXXSWAPD) { - dl = SDLoc(V2->getOperand(0)); - V2 = V2->getOperand(0)->getOperand(1); - } - if (isPPC64 && ValType != MVT::v2f64) + if (V1HasXXSWAPD) { + dl = SDLoc(V1->getOperand(0)); + V1 = V1->getOperand(0)->getOperand(1); + } + if (V2HasXXSWAPD) { + dl = SDLoc(V2->getOperand(0)); + V2 = V2->getOperand(0)->getOperand(1); + } + + if (isPPC64 && (V1HasXXSWAPD || V2HasXXSWAPD)) { + if (ValType != MVT::v2f64) V1 = DAG.getBitcast(MVT::v2f64, V1); - if (isPPC64 && V2.getValueType() != MVT::v2f64) + if (V2.getValueType() != MVT::v2f64) V2 = DAG.getBitcast(MVT::v2f64, V2); } @@ -10429,6 +10423,11 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG, if (Opcode == PPCISD::XXPERM) VPermMask = DAG.getBitcast(MVT::v4i32, VPermMask); + // Only need to place items backwards in LE, + // the mask was properly calculated. 
+ if (isLittleEndian) + std::swap(V1, V2); + SDValue VPERMNode = DAG.getNode(Opcode, dl, V1.getValueType(), V1, V2, VPermMask); diff --git a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll index 6410738..f729018d 100644 --- a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll +++ b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll @@ -1058,16 +1058,15 @@ define <4 x i32> @fromDiffMemVarDi(ptr nocapture readonly %arr, i32 signext %ele ; ; P8LE-LABEL: fromDiffMemVarDi: ; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addis r5, r2, .LCPI9_0@toc@ha ; P8LE-NEXT: sldi r4, r4, 2 +; P8LE-NEXT: addi r5, r5, .LCPI9_0@toc@l ; P8LE-NEXT: add r3, r3, r4 +; P8LE-NEXT: lxvd2x vs0, 0, r5 ; P8LE-NEXT: addi r3, r3, -12 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r3, r2, .LCPI9_0@toc@ha -; P8LE-NEXT: addi r3, r3, .LCPI9_0@toc@l +; P8LE-NEXT: lxvd2x v3, 0, r3 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v2, v2, v3 +; P8LE-NEXT: vperm v2, v3, v3, v2 ; P8LE-NEXT: blr entry: %idxprom = sext i32 %elem to i64 @@ -1478,13 +1477,12 @@ define <4 x i32> @fromDiffMemConsDConvftoi(ptr nocapture readonly %ptr) { ; ; P8LE-LABEL: fromDiffMemConsDConvftoi: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r3, r2, .LCPI18_0@toc@ha -; P8LE-NEXT: addi r3, r3, .LCPI18_0@toc@l +; P8LE-NEXT: addis r4, r2, .LCPI18_0@toc@ha +; P8LE-NEXT: lxvd2x v3, 0, r3 +; P8LE-NEXT: addi r4, r4, .LCPI18_0@toc@l +; P8LE-NEXT: lxvd2x vs0, 0, r4 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v2, v2, v3 +; P8LE-NEXT: vperm v2, v3, v3, v2 ; P8LE-NEXT: xvcvspsxws v2, v2 ; P8LE-NEXT: blr entry: @@ -2580,16 +2578,15 @@ define <4 x i32> @fromDiffMemVarDui(ptr nocapture readonly %arr, i32 signext %el ; ; P8LE-LABEL: fromDiffMemVarDui: ; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addis r5, r2, .LCPI41_0@toc@ha ; P8LE-NEXT: sldi r4, 
r4, 2 +; P8LE-NEXT: addi r5, r5, .LCPI41_0@toc@l ; P8LE-NEXT: add r3, r3, r4 +; P8LE-NEXT: lxvd2x vs0, 0, r5 ; P8LE-NEXT: addi r3, r3, -12 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r3, r2, .LCPI41_0@toc@ha -; P8LE-NEXT: addi r3, r3, .LCPI41_0@toc@l +; P8LE-NEXT: lxvd2x v3, 0, r3 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v2, v2, v3 +; P8LE-NEXT: vperm v2, v3, v3, v2 ; P8LE-NEXT: blr entry: %idxprom = sext i32 %elem to i64 @@ -3000,13 +2997,12 @@ define <4 x i32> @fromDiffMemConsDConvftoui(ptr nocapture readonly %ptr) { ; ; P8LE-LABEL: fromDiffMemConsDConvftoui: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r3, r2, .LCPI50_0@toc@ha -; P8LE-NEXT: addi r3, r3, .LCPI50_0@toc@l +; P8LE-NEXT: addis r4, r2, .LCPI50_0@toc@ha +; P8LE-NEXT: lxvd2x v3, 0, r3 +; P8LE-NEXT: addi r4, r4, .LCPI50_0@toc@l +; P8LE-NEXT: lxvd2x vs0, 0, r4 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v2, v2, v3 +; P8LE-NEXT: vperm v2, v3, v3, v2 ; P8LE-NEXT: xvcvspuxws v2, v2 ; P8LE-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll index 2569731..11cc8ab 100644 --- a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll @@ -183,14 +183,13 @@ entry: define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_none_v16i8: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI2_0@toc@ha +; CHECK-LE-P8-NEXT: lxvd2x v3, 0, r4 ; CHECK-LE-P8-NEXT: mtvsrd v4, r3 -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI2_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 ; 
CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v16i8: @@ -431,14 +430,13 @@ entry: define <16 x i8> @test_none_v8i16(i16 %arg, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_none_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI5_0@toc@ha +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI5_0@toc@ha +; CHECK-LE-P8-NEXT: lxvd2x v3, 0, r4 ; CHECK-LE-P8-NEXT: mtvsrd v4, r3 -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI5_0@toc@l +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI5_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v8i16: diff --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll index 37820af..201bc5b 100644 --- a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll @@ -469,19 +469,18 @@ entry: define void @test_none_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readonly %ptr2) { ; CHECK-LE-P8-LABEL: test_none_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI4_0@toc@ha -; CHECK-LE-P8-NEXT: lxsdx v4, 0, r3 +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r3 ; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI4_1@toc@ha -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI4_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x v4, 0, r4 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI4_0@toc@l ; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI4_1@toc@l -; 
CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r4 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd v3, vs1 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v3, v4, v2 ; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 ; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 ; CHECK-LE-P8-NEXT: xxswapd vs0, v2 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 diff --git a/llvm/test/CodeGen/PowerPC/vperm-swap.ll b/llvm/test/CodeGen/PowerPC/vperm-swap.ll index 0a3b5ae..1f97978 100644 --- a/llvm/test/CodeGen/PowerPC/vperm-swap.ll +++ b/llvm/test/CodeGen/PowerPC/vperm-swap.ll @@ -4,32 +4,31 @@ define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8: .LCPI0_0: -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 30 # 0x1e +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 22 # 0x16 ; CHECK-LE-P8-NEXT: .byte 7 # 0x7 -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 ; 
CHECK-LE-P8-LABEL: test_none_v16i8: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; CHECK-LE-P8-NEXT: lxvd2x v3, 0, r4 ; CHECK-LE-P8-NEXT: mtvsrd v4, r3 -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI0_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-LE-P8-NEXT: blr entry: %lhs = load <16 x i8>, ptr %b, align 4 |