diff options
Diffstat (limited to 'llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 117 |
1 files changed, 114 insertions, 3 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7883acc..ddcecc00 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -952,10 +952,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // promoted to f32. v2f16 is expanded to f16, which is then promoted // to f32. for (const auto &Op : - {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) { + {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) { setOperationAction(Op, MVT::f16, Promote); setOperationAction(Op, MVT::f32, Legal); - setOperationAction(Op, MVT::f64, Legal); + // only div/rem/sqrt are legal for f64 + if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) { + setOperationAction(Op, MVT::f64, Legal); + } setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand); setOperationAction(Op, MVT::bf16, Promote); AddPromotedToType(Op, MVT::bf16, MVT::f32); @@ -2068,6 +2071,8 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode = NVPTX::PTXPrmtMode::NONE) { + assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 && + Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands"); return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32, {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)}); } @@ -5872,6 +5877,8 @@ static SDValue combineADDRSPACECAST(SDNode *N, // details: // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) { + assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands"); + if (Mode == NVPTX::PTXPrmtMode::NONE) return Selector; @@ -5903,6 +5910,8 @@ static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) { } static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) { + assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 && + Selector.getBitWidth() == 32 && "PRMT must have i32 operands"); // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}} APInt BitField = B.concat(A); APInt SelectorVal = getPRMTSelector(Selector, Mode); @@ -6537,10 +6546,13 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, KnownBits BKnown = DAG.computeKnownBits(B, Depth); // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}} + assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 && + "PRMT must have i32 operands"); + assert(Known.getBitWidth() == 32 && "PRMT must have i32 result"); KnownBits BitField = BKnown.concat(AKnown); APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode); - for (unsigned I : llvm::seq(std::min(4U, Known.getBitWidth() / 8))) { + for (unsigned I : llvm::seq(4)) { APInt Sel = SelectorVal.extractBits(4, I * 4); unsigned Idx = Sel.getLoBits(3).getZExtValue(); unsigned Sign = Sel.getHiBits(1).getZExtValue(); @@ -6564,3 +6576,102 @@ void NVPTXTargetLowering::computeKnownBitsForTargetNode( break; } } + +static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal, + const APInt &DemandedBits) { + APInt DemandedLHS = APInt(32, 0); + APInt DemandedRHS = APInt(32, 0); + + for (unsigned I : llvm::seq(4)) { + if (DemandedBits.extractBits(8, I * 8).isZero()) + continue; + + APInt Sel = SelectorVal.extractBits(4, I * 4); + unsigned Idx = Sel.getLoBits(3).getZExtValue(); + unsigned Sign = Sel.getHiBits(1).getZExtValue(); + + APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS; + unsigned ByteStart = (Idx % 4) * 8; + if (Sign) + Src.setBit(ByteStart + 7); + else + Src.setBits(ByteStart, ByteStart + 8); + } + + return {DemandedLHS, DemandedRHS}; +} + +// Replace undef with 0 as this is easier for other optimizations such as +// known bits. +static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG) { + if (!Op) + return SDValue(); + if (Op.isUndef()) + return DAG.getConstant(0, SDLoc(), MVT::i32); + return Op; +} + +static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, + const APInt &DemandedBits, + SelectionDAG &DAG, + const TargetLowering &TLI, + unsigned Depth) { + assert(PRMT.getOpcode() == NVPTXISD::PRMT); + SDValue Op0 = PRMT.getOperand(0); + SDValue Op1 = PRMT.getOperand(1); + auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2)); + if (!SelectorConst) + return SDValue(); + + unsigned Mode = PRMT.getConstantOperandVal(3); + const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode); + + // Try to simplify the PRMT to one of the inputs if the used bytes are all + // from the same input in the correct order. + const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8; + const unsigned SelBits = (4 - LeadingBytes) * 4; + if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits)) + return Op0; + if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits)) + return Op1; + + auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits); + + // Attempt to avoid multi-use ops if we don't need anything from them. + SDValue DemandedOp0 = + TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1); + SDValue DemandedOp1 = + TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1); + + DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG); + DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG); + if ((DemandedOp0 && DemandedOp0 != Op0) || + (DemandedOp1 && DemandedOp1 != Op1)) { + Op0 = DemandedOp0 ? DemandedOp0 : Op0; + Op1 = DemandedOp1 ? DemandedOp1 : Op1; + return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG); + } + + return SDValue(); +} + +bool NVPTXTargetLowering::SimplifyDemandedBitsForTargetNode( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const { + Known.resetAll(); + + switch (Op.getOpcode()) { + case NVPTXISD::PRMT: + if (SDValue Result = simplifyDemandedBitsForPRMT(Op, DemandedBits, TLO.DAG, + *this, Depth)) { + TLO.CombineTo(Op, Result); + return true; + } + break; + default: + break; + } + + computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth); + return false; +} |