diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 273 |
1 files changed, 211 insertions, 62 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 3412bb5..39b4200 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1134,15 +1134,26 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; unsigned Opc; + bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1); if (Subtarget->hasMADIntraFwdBug()) Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 : AMDGPU::V_MAD_U64_U32_gfx11_e64; + else if (UseNoCarry) + Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64; else Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), Clamp }; + + if (UseNoCarry) { + MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops); + ReplaceUses(SDValue(N, 0), SDValue(Mad, 0)); + CurDAG->RemoveDeadNode(N); + return; + } + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } @@ -1863,15 +1874,6 @@ bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr, SIInstrFlags::FlatScratch); } -// If this matches zero_extend i32:x, return x -static SDValue matchZExtFromI32(SDValue Op) { - if (Op.getOpcode() != ISD::ZERO_EXTEND) - return SDValue(); - - SDValue ExtSrc = Op.getOperand(0); - return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue(); -} - // If this matches *_extend i32:x, return x // Otherwise if the value is I32 returns x. 
static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, @@ -1890,12 +1892,13 @@ static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, } // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) -bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, - SDValue Addr, - SDValue &SAddr, - SDValue &VOffset, - SDValue &Offset) const { +// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset) +bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, bool &ScaleOffset, + bool NeedIOffset) const { int64_t ImmOffset = 0; + ScaleOffset = false; // Match the immediate offset first, which canonically is moved as low as // possible. @@ -1905,7 +1908,8 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, + if (NeedIOffset && + TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal)) { Addr = LHS; ImmOffset = COffsetVal; @@ -1915,11 +1919,14 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, // saddr + large_offset -> saddr + // (voffset = large_offset & ~MaxOffset) + // (large_offset & MaxOffset); - int64_t SplitImmOffset, RemainderOffset; - std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( - COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal; + if (NeedIOffset) { + std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( + COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + } - if (isUInt<32>(RemainderOffset)) { + if (Subtarget->hasSignedGVSOffset() ? 
isInt<32>(RemainderOffset) + : isUInt<32>(RemainderOffset)) { SDNode *VMov = CurDAG->getMachineNode( AMDGPU::V_MOV_B32_e32, SL, MVT::i32, CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); @@ -1946,21 +1953,26 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, // Match the variable offset. if (Addr.getOpcode() == ISD::ADD) { LHS = Addr.getOperand(0); - RHS = Addr.getOperand(1); if (!LHS->isDivergent()) { - // add (i64 sgpr), (zero_extend (i32 vgpr)) - if (SDValue ZextRHS = matchZExtFromI32(RHS)) { + // add (i64 sgpr), (*_extend (i32 vgpr)) + RHS = Addr.getOperand(1); + ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset()); + if (SDValue ExtRHS = matchExtFromI32orI32( + RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) { SAddr = LHS; - VOffset = ZextRHS; + VOffset = ExtRHS; } } + RHS = Addr.getOperand(1); if (!SAddr && !RHS->isDivergent()) { - // add (zero_extend (i32 vgpr)), (i64 sgpr) - if (SDValue ZextLHS = matchZExtFromI32(LHS)) { + // add (*_extend (i32 vgpr)), (i64 sgpr) + ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset()); + if (SDValue ExtLHS = matchExtFromI32orI32( + LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) { SAddr = RHS; - VOffset = ZextLHS; + VOffset = ExtLHS; } } @@ -1970,6 +1982,27 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, } } + if (Subtarget->hasScaleOffset() && + (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset() + ? 
AMDGPUISD::MAD_I64_I32 + : AMDGPUISD::MAD_U64_U32) || + (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 && + CurDAG->SignBitIsZero(Addr.getOperand(0)))) && + Addr.getOperand(0)->isDivergent() && + isa<ConstantSDNode>(Addr.getOperand(1)) && + !Addr.getOperand(2)->isDivergent()) { + // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr) + unsigned Size = + (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8; + ScaleOffset = Addr.getConstantOperandVal(1) == Size; + if (ScaleOffset) { + SAddr = Addr.getOperand(2); + VOffset = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); + return true; + } + } + if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Addr)) return false; @@ -1989,10 +2022,28 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const { - if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset)) + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) return false; - CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); + CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0, + SDLoc(), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, + SDValue &CPol) const { + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) + return false; + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? 
AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); return true; } @@ -2000,14 +2051,33 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset, SDValue &CPol) const { - if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset)) + bool ScaleOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset)) return false; - unsigned CPolVal = AMDGPU::CPol::GLC; + unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC; CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32); return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr, + SDValue &SAddr, + SDValue &VOffset, + SDValue &CPol) const { + bool ScaleOffset; + SDValue DummyOffset; + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset, + false)) + return false; + + // We are assuming CPol is always the last operand of the intrinsic. + auto PassedCPol = + N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL; + CPol = CurDAG->getTargetConstant( + (ScaleOffset ? 
AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32); + return true; +} + static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) { SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); @@ -2091,7 +2161,8 @@ bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug( bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &SAddr, - SDValue &Offset) const { + SDValue &Offset, + SDValue &CPol) const { int64_t ImmOffset = 0; SDValue LHS, RHS; @@ -2123,6 +2194,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) return false; Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32); + CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); return true; } } @@ -2156,6 +2228,10 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return false; SAddr = SelectSAddrFI(CurDAG, SAddr); Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); + + bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */); + CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0, + SDLoc(), MVT::i32); return true; } @@ -3830,58 +3906,114 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +// Match lowered fpext from bf16 to f32. This is a bit operation extending +// a 16-bit value with 16-bit of zeroes at LSB: +// +// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val))))) +// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true +// 3. 
(f32 (bitcast (shl i32:val, 16))) -> IsExtractHigh = false +static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) { + if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST) + return SDValue(); + Op = Op.getOperand(0); + + IsExtractHigh = false; + if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) { + auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0)); + if (!Low16 || !Low16->isZero()) + return SDValue(); + Op = stripBitcast(Op.getOperand(1)); + if (Op.getValueType() != MVT::bf16) + return SDValue(); + return Op; + } + + if (Op.getValueType() != MVT::i32) + return SDValue(); + + if (Op.getOpcode() == ISD::AND) { + if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (Mask->getZExtValue() == 0xffff0000) { + IsExtractHigh = true; + return Op.getOperand(0); + } + } + return SDValue(); + } + + if (Op.getOpcode() == ISD::SHL) { + if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (Amt->getZExtValue() == 16) + return Op.getOperand(0); + } + } + + return SDValue(); +} + // The return value is not whether the match is possible (which it always is), // but whether or not it a conversion is really used. bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, - unsigned &Mods) const { + unsigned &Mods, + MVT VT) const { Mods = 0; SelectVOP3ModsImpl(In, Src, Mods); + bool IsExtractHigh = false; if (Src.getOpcode() == ISD::FP_EXTEND) { Src = Src.getOperand(0); - assert(Src.getValueType() == MVT::f16); - Src = stripBitcast(Src); + } else if (VT == MVT::bf16) { + SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh); + if (!B16) + return false; + Src = B16; + } else + return false; - // Be careful about folding modifiers if we already have an abs. fneg is - // applied last, so we don't want to apply an earlier fneg. 
- if ((Mods & SISrcMods::ABS) == 0) { - unsigned ModsTmp; - SelectVOP3ModsImpl(Src, Src, ModsTmp); + if (Src.getValueType() != VT && + (VT != MVT::bf16 || Src.getValueType() != MVT::i32)) + return false; - if ((ModsTmp & SISrcMods::NEG) != 0) - Mods ^= SISrcMods::NEG; + Src = stripBitcast(Src); - if ((ModsTmp & SISrcMods::ABS) != 0) - Mods |= SISrcMods::ABS; - } + // Be careful about folding modifiers if we already have an abs. fneg is + // applied last, so we don't want to apply an earlier fneg. + if ((Mods & SISrcMods::ABS) == 0) { + unsigned ModsTmp; + SelectVOP3ModsImpl(Src, Src, ModsTmp); - // op_sel/op_sel_hi decide the source type and source. - // If the source's op_sel_hi is set, it indicates to do a conversion from fp16. - // If the sources's op_sel is set, it picks the high half of the source - // register. + if ((ModsTmp & SISrcMods::NEG) != 0) + Mods ^= SISrcMods::NEG; - Mods |= SISrcMods::OP_SEL_1; - if (isExtractHiElt(Src, Src)) { - Mods |= SISrcMods::OP_SEL_0; + if ((ModsTmp & SISrcMods::ABS) != 0) + Mods |= SISrcMods::ABS; + } - // TODO: Should we try to look for neg/abs here? - } + // op_sel/op_sel_hi decide the source type and source. + // If the source's op_sel_hi is set, it indicates to do a conversion from + // fp16. If the source's op_sel is set, it picks the high half of the source + // register. - // Prevent unnecessary subreg COPY to VGPR_16 - if (Src.getOpcode() == ISD::TRUNCATE && - Src.getOperand(0).getValueType() == MVT::i32) { - Src = Src.getOperand(0); - } - return true; + Mods |= SISrcMods::OP_SEL_1; + if (IsExtractHigh || + (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) { + Mods |= SISrcMods::OP_SEL_0; + + // TODO: Should we try to look for neg/abs here? 
} - return false; + // Prevent unnecessary subreg COPY to VGPR_16 + if (Src.getOpcode() == ISD::TRUNCATE && + Src.getOperand(0).getValueType() == MVT::i32) { + Src = Src.getOperand(0); + } + return true; } bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - if (!SelectVOP3PMadMixModsImpl(In, Src, Mods)) + if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16)) return false; SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; @@ -3890,7 +4022,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - SelectVOP3PMadMixModsImpl(In, Src, Mods); + SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16); + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16)) + return false; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16); SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } |