Diffstat (limited to 'llvm/lib/Target/AMDGPU')
62 files changed, 1092 insertions, 557 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 007b481..0059a86 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -439,10 +439,6 @@ struct AMDGPUPrintfRuntimeBindingPass PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); }; -struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); -}; - void initializeSIOptimizeExecMaskingPreRALegacyPass(PassRegistry &); extern char &SIOptimizeExecMaskingPreRAID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index f266398..8e4b636 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1548,7 +1548,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", "gfx12", - [FeatureFP64, FeatureAddressableLocalMemorySize65536, FeatureMIMG_R128, + [FeatureFP64, FeatureMIMG_R128, FeatureFlatAddressSpace, Feature16BitInsts, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts, @@ -1977,6 +1977,7 @@ def FeatureISAVersion11_5_3 : FeatureSet< def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, + FeatureAddressableLocalMemorySize65536, FeatureLDSBankCount32, FeatureDLInsts, FeatureDot7Insts, @@ -2019,6 +2020,7 @@ def FeatureISAVersion12_50 : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, FeatureCUStores, + FeatureAddressableLocalMemorySize327680, FeatureCuMode, Feature64BitLiterals, FeatureLDSBankCount32, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 2a324e5..66c3fad 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -41,6 +41,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCValue.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Compiler.h" @@ -733,6 +734,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutContext, IsLocal)); } + // Emit _dvgpr$ symbol when appropriate. + emitDVgprSymbol(MF); + if (isVerbose()) { MCSectionELF *CommentSection = Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); @@ -875,6 +879,49 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { return false; } +// When appropriate, add a _dvgpr$ symbol, with the value of the function +// symbol, plus an offset encoding one less than the number of VGPR blocks used +// by the function in bits 5..3 of the symbol value. A "VGPR block" can be +// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is +// used by a front-end to have functions that are chained rather than called, +// and a dispatcher that dynamically resizes the VGPR count before dispatching +// to a function. 
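As an illustration of the block-count encoding described in the comment above, a minimal standalone sketch; the helper name and signature are hypothetical, and the actual logic is emitDVgprSymbol below:

  #include <algorithm>
  #include <cstdint>
  #include <optional>

  // Encode the _dvgpr$ offset: one less than the number of VGPR blocks, placed
  // in bits 5..3. BlockSize is 16 (max 128 VGPRs) or 32 (max 256 VGPRs).
  // Returns std::nullopt when more than 8 blocks would be needed, which the
  // real pass reports as an error.
  std::optional<uint32_t> encodeDVgprOffset(uint32_t NumVGPRs, uint32_t BlockSize) {
    uint32_t Blocks =
        (std::max<uint32_t>(NumVGPRs, 1) + BlockSize - 1) / BlockSize;
    if (Blocks > 8)
      return std::nullopt;
    return (Blocks - 1) << 3; // e.g. 96 VGPRs, block size 16 -> 6 blocks -> 0x28
  }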
+void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) { + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + if (MFI.isDynamicVGPREnabled() && + MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS_Chain) { + MCContext &Ctx = MF.getContext(); + unsigned BlockSize = MFI.getDynamicVGPRBlockSize(); + MCValue NumVGPRs; + if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable( + NumVGPRs, nullptr) || + !NumVGPRs.isAbsolute()) { + llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol"); + } + // Calculate number of VGPR blocks. + // Treat 0 VGPRs as 1 VGPR to avoid underflowing. + unsigned NumBlocks = + divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize); + + if (NumBlocks > 8) { + OutContext.reportError({}, + "too many DVGPR blocks for _dvgpr$ symbol for '" + + Twine(CurrentFnSym->getName()) + "'"); + return; + } + unsigned EncodedNumBlocks = (NumBlocks - 1) << 3; + // Add to function symbol to create _dvgpr$ symbol. + const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd( + MCSymbolRefExpr::create(CurrentFnSym, Ctx), + MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx); + MCSymbol *DVgprFuncSym = + Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName()); + OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal); + emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility()); + emitLinkage(&MF.getFunction(), DVgprFuncSym); + } +} + // TODO: Fold this into emitFunctionBodyStart. void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { // In the beginning all features are either 'Any' or 'NotSupported', @@ -997,89 +1044,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const Function &F = MF.getFunction(); // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave - // dispatch registers are function args. - unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; - - if (isShader(F.getCallingConv())) { - bool IsPixelShader = - F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); - - // Calculate the number of VGPR registers based on the SPI input registers - uint32_t InputEna = 0; - uint32_t InputAddr = 0; - unsigned LastEna = 0; - - if (IsPixelShader) { - // Note for IsPixelShader: - // By this stage, all enabled inputs are tagged in InputAddr as well. - // We will use InputAddr to determine whether the input counts against the - // vgpr total and only use the InputEnable to determine the last input - // that is relevant - if extra arguments are used, then we have to honour - // the InputAddr for any intermediate non-enabled inputs. - InputEna = MFI->getPSInputEnable(); - InputAddr = MFI->getPSInputAddr(); - - // We only need to consider input args up to the last used arg. - assert((InputEna || InputAddr) && - "PSInputAddr and PSInputEnable should " - "never both be 0 for AMDGPU_PS shaders"); - // There are some rare circumstances where InputAddr is non-zero and - // InputEna can be set to 0. In this case we default to setting LastEna - // to 1. - LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1; - } + // dispatch registers as function args. + unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(), + WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs(); - // FIXME: We should be using the number of registers determined during - // calling convention lowering to legalize the types. 
- const DataLayout &DL = F.getDataLayout(); - unsigned PSArgCount = 0; - unsigned IntermediateVGPR = 0; - for (auto &Arg : F.args()) { - unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32; - if (Arg.hasAttribute(Attribute::InReg)) { - WaveDispatchNumSGPR += NumRegs; - } else { - // If this is a PS shader and we're processing the PS Input args (first - // 16 VGPR), use the InputEna and InputAddr bits to define how many - // VGPRs are actually used. - // Any extra VGPR arguments are handled as normal arguments (and - // contribute to the VGPR count whether they're used or not). - if (IsPixelShader && PSArgCount < 16) { - if ((1 << PSArgCount) & InputAddr) { - if (PSArgCount < LastEna) - WaveDispatchNumVGPR += NumRegs; - else - IntermediateVGPR += NumRegs; - } - PSArgCount++; - } else { - // If there are extra arguments we have to include the allocation for - // the non-used (but enabled with InputAddr) input arguments - if (IntermediateVGPR) { - WaveDispatchNumVGPR += IntermediateVGPR; - IntermediateVGPR = 0; - } - WaveDispatchNumVGPR += NumRegs; - } - } - } + if (WaveDispatchNumSGPR) { ProgInfo.NumSGPR = AMDGPUMCExpr::createMax( - {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx); + {ProgInfo.NumSGPR, + MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs, + Ctx)}, + Ctx); + } + if (WaveDispatchNumVGPR) { ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax( {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx); ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR( ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx); - } else if (isKernel(F.getCallingConv()) && - MFI->getNumKernargPreloadedSGPRs()) { - // Consider cases where the total number of UserSGPRs with trailing - // allocated preload SGPRs, is greater than the number of explicitly - // referenced SGPRs. - const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd( - CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx); - ProgInfo.NumSGPR = - AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx); } // Adjust number of registers used to meet default/requested minimum/maximum @@ -1168,7 +1150,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.DX10Clamp = Mode.DX10Clamp; unsigned LDSAlignShift; - if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) { + if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) { + // LDS is allocated in 256 dword blocks. + LDSAlignShift = 10; + } else if (STM.getFeatureBits().test( + FeatureAddressableLocalMemorySize163840)) { // LDS is allocated in 320 dword blocks. LDSAlignShift = 11; } else if (STM.getFeatureBits().test( @@ -1205,8 +1191,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, CreateExpr(STM.getWavefrontSize()), Ctx), CreateExpr(1ULL << ScratchAlignShift)); - if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { + if (STM.supportsWGP()) { ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; + } + + if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { ProgInfo.MemOrdered = 1; ProgInfo.FwdProgress = 1; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 63589d2..9e854fa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -54,6 +54,9 @@ private: MCCodeEmitter *DumpCodeInstEmitter = nullptr; + // When appropriate, add a _dvgpr$ symbol. 
+ void emitDVgprSymbol(MachineFunction &MF); + void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF); void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out, const SIProgramInfo &KernelInfo, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 3d8d274..d1a5b4e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( ++i; } + if (Info->getNumKernargPreloadedSGPRs()) + Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs()); + TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false); return true; @@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!determineAssignments(Assigner, SplitArgs, CCInfo)) return false; + if (IsEntryFunc) { + // This assumes the registers are allocated by CCInfo in ascending order + // with no gaps. + Info->setNumWaveDispatchSGPRs( + CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters())); + Info->setNumWaveDispatchVGPRs( + CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters())); + } + FormalArgHandler Handler(B, MRI); if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B)) return false; @@ -1464,9 +1476,22 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const { if (Function *F = Info.CB->getCalledFunction()) if (F->isIntrinsic()) { - assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain && - "Unexpected intrinsic"); - return lowerChainCall(MIRBuilder, Info); + switch (F->getIntrinsicID()) { + case Intrinsic::amdgcn_cs_chain: + return lowerChainCall(MIRBuilder, Info); + case Intrinsic::amdgcn_call_whole_wave: + Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave; + + // Get the callee from the original instruction, so it doesn't look like + // this is an indirect call. 
+ Info.Callee = MachineOperand::CreateGA( + cast<GlobalValue>(Info.CB->getOperand(0)), /*Offset=*/0); + Info.OrigArgs.erase(Info.OrigArgs.begin()); + Info.IsVarArg = false; + break; + default: + llvm_unreachable("Unexpected intrinsic call"); + } } if (Info.IsVarArg) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td index 74d1fae..d14b5ce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -30,6 +30,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature< def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>; def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>; def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>; +def FeatureAddressableLocalMemorySize327680 : SubtargetFeatureAddressableLocalMemorySize<327680>; class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature< "wavefrontsize"#!shl(1, ValueLog2), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 9d6584a..04c4d00 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -76,6 +76,40 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) { return false; } +static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src, + llvm::SelectionDAG *CurDAG, + const GCNSubtarget *Subtarget) { + if (!Subtarget->useRealTrue16Insts()) { + return Lo; + } + + SDValue NewSrc; + SDLoc SL(Lo); + + if (Lo->isDivergent()) { + SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + SL, Lo.getValueType()), + 0); + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo, + CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef, + CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)}; + + NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL, + Src.getValueType(), Ops), + 0); + } else { + // the S_MOV is needed since the Lo could still be a VGPR16. + // With S_MOV, isel insert a "sgpr32 = copy vgpr16" and we reply on + // the fixvgpr2sgprcopy pass to legalize it + NewSrc = SDValue( + CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo), + 0); + } + + return NewSrc; +} + // Look through operations that obscure just looking at the low 16-bits of the // same register. static SDValue stripExtractLoElt(SDValue In) { @@ -1162,18 +1196,25 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == ISD::SMUL_LOHI; + SDVTList VTList; unsigned Opc; - if (Subtarget->hasMADIntraFwdBug()) - Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 - : AMDGPU::V_MAD_U64_U32_gfx11_e64; - else - Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + if (Subtarget->hasMadU64U32NoCarry()) { + VTList = CurDAG->getVTList(MVT::i64); + Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64; + } else { + VTList = CurDAG->getVTList(MVT::i64, MVT::i1); + if (Subtarget->hasMADIntraFwdBug()) { + Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 + : AMDGPU::V_MAD_U64_U32_gfx11_e64; + } else { + Opc = Signed ? 
AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + } + } SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64); SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp}; - SDNode *Mad = CurDAG->getMachineNode( - Opc, SL, CurDAG->getVTList(MVT::i64, MVT::i1), Ops); + SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops); if (!SDValue(N, 0).use_empty()) { SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32); SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL, @@ -3412,8 +3453,10 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, // Really a scalar input. Just select from the low half of the register to // avoid packing. - if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) { + if (VecSize == Lo.getValueSizeInBits()) { Src = Lo; + } else if (VecSize == 32) { + Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget); } else { assert(Lo.getValueSizeInBits() == 32 && VecSize == 64); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 64e68ab..a28e272 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4002,7 +4002,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: case Intrinsic::amdgcn_rsq_clamp: - case Intrinsic::amdgcn_tanh: { + case Intrinsic::amdgcn_tanh: + case Intrinsic::amdgcn_prng_b32: { // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted SDValue Src = N->getOperand(1); return Src.isUndef() ? Src : SDValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b7fd131..5d31eed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2368,8 +2368,10 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: return selectDSBvhStackIntrinsic(I); + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: return selectNamedBarrierInit(I, IntrinsicID); + case Intrinsic::amdgcn_s_barrier_join: case Intrinsic::amdgcn_s_get_named_barrier_state: return selectNamedBarrierInst(I, IntrinsicID); case Intrinsic::amdgcn_s_get_barrier_state: @@ -5521,11 +5523,18 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root, Register PtrBase; int64_t ConstOffset; - std::tie(PtrBase, ConstOffset) = + bool IsInBounds; + std::tie(PtrBase, ConstOffset, IsInBounds) = getPtrBaseWithConstantOffset(Root.getReg(), *MRI); - if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch && - !isFlatScratchBaseLegal(Root.getReg()))) + // Adding the offset to the base address with an immediate in a FLAT + // instruction must not change the memory aperture in which the address falls. + // Therefore we can only fold offsets from inbounds GEPs into FLAT + // instructions. 
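As an illustration, the folding rule above reduced to a free-standing predicate (hypothetical name; the actual check is the condition that follows):

  // A constant offset is folded into a true FLAT access only when the pointer
  // arithmetic is known inbounds; otherwise base and base+offset could fall in
  // different apertures (e.g. an address near the end of the LDS aperture plus
  // a large offset) and the folded form would access the wrong memory.
  // The FlatScratch base-legality check in the real code is omitted here.
  static bool canFoldOffsetIntoFlat(bool IsTrueFlat, bool IsInBounds,
                                    int64_t ConstOffset) {
    if (ConstOffset == 0)
      return false;                   // nothing to fold
    return !IsTrueFlat || IsInBounds;
  }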
+ if (ConstOffset == 0 || + (FlatVariant == SIInstrFlags::FlatScratch && + !isFlatScratchBaseLegal(Root.getReg())) || + (FlatVariant == SIInstrFlags::FLAT && !IsInBounds)) return Default; unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); @@ -5577,7 +5586,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, // Match the immediate offset first, which canonically is moved as low as // possible. - std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(Addr, *MRI); if (ConstOffset != 0) { if (NeedIOffset && @@ -5760,7 +5770,8 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { // Match the immediate offset first, which canonically is moved as low as // possible. - std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(Addr, *MRI); if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) && TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, @@ -5836,7 +5847,8 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { // Match the immediate offset first, which canonically is moved as low as // possible. - std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(Addr, *MRI); Register OrigAddr = Addr; if (ConstOffset != 0 && @@ -5942,7 +5954,8 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); Register PtrBase; int64_t ConstOffset; - std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI); + std::tie(PtrBase, ConstOffset, std::ignore) = + getPtrBaseWithConstantOffset(VAddr, *MRI); if (ConstOffset != 0) { if (TII.isLegalMUBUFImmOffset(ConstOffset) && (!STI.privateMemoryResourceIsRangeChecked() || @@ -6181,8 +6194,8 @@ AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const Register PtrBase; int64_t Offset; - std::tie(PtrBase, Offset) = - getPtrBaseWithConstantOffset(Root.getReg(), *MRI); + std::tie(PtrBase, Offset, std::ignore) = + getPtrBaseWithConstantOffset(Root.getReg(), *MRI); if (Offset) { if (isDSOffsetLegal(PtrBase, Offset)) { @@ -6243,8 +6256,8 @@ AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root, Register PtrBase; int64_t Offset; - std::tie(PtrBase, Offset) = - getPtrBaseWithConstantOffset(Root.getReg(), *MRI); + std::tie(PtrBase, Offset, std::ignore) = + getPtrBaseWithConstantOffset(Root.getReg(), *MRI); if (Offset) { int64_t OffsetValue0 = Offset; @@ -6265,22 +6278,25 @@ AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root, } /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return -/// the base value with the constant offset. There may be intervening copies -/// between \p Root and the identified constant. Returns \p Root, 0 if this does -/// not match the pattern. -std::pair<Register, int64_t> +/// the base value with the constant offset, and if the offset computation is +/// known to be inbounds. There may be intervening copies between \p Root and +/// the identified constant. Returns \p Root, 0, false if this does not match +/// the pattern. 
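As a usage illustration of the widened return type (hypothetical caller code): the hunks above keep std::tie with std::ignore where the inbounds flag is irrelevant, while a caller that needs it can unpack all three values:

  auto [Base, Off, IsInBounds] = getPtrBaseWithConstantOffset(Root.getReg(), *MRI);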
+std::tuple<Register, int64_t, bool> AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( - Register Root, const MachineRegisterInfo &MRI) const { + Register Root, const MachineRegisterInfo &MRI) const { MachineInstr *RootI = getDefIgnoringCopies(Root, MRI); if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) - return {Root, 0}; + return {Root, 0, false}; MachineOperand &RHS = RootI->getOperand(2); std::optional<ValueAndVReg> MaybeOffset = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); if (!MaybeOffset) - return {Root, 0}; - return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()}; + return {Root, 0, false}; + bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds); + return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(), + IsInBounds}; } static void addZeroImm(MachineInstrBuilder &MIB) { @@ -6358,7 +6374,8 @@ AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const { Register PtrBase; int64_t Offset; - std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI); + std::tie(PtrBase, Offset, std::ignore) = + getPtrBaseWithConstantOffset(Src, *MRI); if (isUInt<32>(Offset)) { Data.N0 = PtrBase; Data.Offset = Offset; @@ -6757,6 +6774,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { switch (IntrID) { default: llvm_unreachable("not a named barrier op"); + case Intrinsic::amdgcn_s_barrier_join: + return AMDGPU::S_BARRIER_JOIN_IMM; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_IMM; }; @@ -6764,6 +6783,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) { switch (IntrID) { default: llvm_unreachable("not a named barrier op"); + case Intrinsic::amdgcn_s_barrier_join: + return AMDGPU::S_BARRIER_JOIN_M0; case Intrinsic::amdgcn_s_get_named_barrier_state: return AMDGPU::S_GET_BARRIER_STATE_M0; }; @@ -6814,8 +6835,11 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4); constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI); + unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init + ? 
AMDGPU::S_BARRIER_INIT_M0 + : AMDGPU::S_BARRIER_SIGNAL_M0; MachineInstrBuilder MIB; - MIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_M0)); + MIB = BuildMI(*MBB, &I, DL, TII.get(Opc)); I.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index c9da419..0924396 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -156,6 +156,7 @@ private: bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const; bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const; bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const; + bool selectSBarrierLeave(MachineInstr &I) const; std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src, bool IsCanonicalizing = true, @@ -295,7 +296,7 @@ private: InstructionSelector::ComplexRendererFns selectDSReadWrite2(MachineOperand &Root, unsigned size) const; - std::pair<Register, int64_t> + std::tuple<Register, int64_t, bool> getPtrBaseWithConstantOffset(Register Root, const MachineRegisterInfo &MRI) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 523c66c..56113e6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -545,7 +545,8 @@ public: AU.addRequired<TargetPassConfig>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<UniformityInfoWrapperPass>(); - AU.setPreservesAll(); + // Invalidates UniformityInfo + AU.setPreservesCFG(); } bool runOnFunction(Function &F) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 40d960e..600a130 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" @@ -137,6 +138,14 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { }; } +// Retrieves the scalar type that's the same size as the mem desc +static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); + return std::make_pair(TypeIdx, LLT::scalar(MemSize)); + }; +} + // Increase the number of vector elements to reach the next legal RegClass. static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { @@ -384,6 +393,16 @@ static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { }; } +// If we have a truncating store or an extending load with a data size larger +// than 32-bits and mem location is a power of 2 +static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); + return isWideScalarExtLoadTruncStore(TypeIdx)(Query) && + isPowerOf2_64(MemSize); + }; +} + // TODO: Should load to s16 be legal? 
Most loads extend to 32-bits, but we // handle some operations by just promoting the register during // selection. There are also d16 loads on GFX9+ which preserve the high bits. @@ -1635,11 +1654,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // May need relegalization for the scalars. return std::pair(0, EltTy); }) - .minScalar(0, S32) - .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) - .widenScalarToNextPow2(0) - .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) - .lower(); + .minScalar(0, S32) + .narrowScalarIf(isTruncStoreToSizePowerOf2(0), + getScalarTypeFromMemDesc(0)) + .widenScalarToNextPow2(0) + .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) + .lower(); } // FIXME: Unaligned accesses not lowered. @@ -5653,7 +5673,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, unsigned SplitSize = 32; if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) && ST.hasDPALU_DPP() && - AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm())) + AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm())) SplitSize = 64; if (Size == SplitSize) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 304e91e..139cad6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -599,8 +599,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitStoreInst(StoreInst &SI) { IRB.SetInsertPoint(&SI); Value *IntV = fatPtrsToInts(V, Ty, IntTy, V->getName()); - for (auto *Dbg : at::getAssignmentMarkers(&SI)) - Dbg->setValue(IntV); + for (auto *Dbg : at::getDVRAssignmentMarkers(&SI)) + Dbg->setRawLocation(ValueAsMetadata::get(IntV)); SI.setOperand(0, IntV); return true; @@ -1361,6 +1361,7 @@ public: PtrParts visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI); PtrParts visitGetElementPtrInst(GetElementPtrInst &GEP); + PtrParts visitPtrToAddrInst(PtrToAddrInst &PA); PtrParts visitPtrToIntInst(PtrToIntInst &PI); PtrParts visitIntToPtrInst(IntToPtrInst &IP); PtrParts visitAddrSpaceCastInst(AddrSpaceCastInst &I); @@ -1954,6 +1955,21 @@ PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) { return {nullptr, nullptr}; } +PtrParts SplitPtrStructs::visitPtrToAddrInst(PtrToAddrInst &PA) { + Value *Ptr = PA.getPointerOperand(); + if (!isSplitFatPtr(Ptr->getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&PA); + + auto [Rsrc, Off] = getPtrParts(Ptr); + Value *Res = IRB.CreateIntCast(Off, PA.getType(), /*isSigned=*/false); + copyMetadata(Res, &PA); + Res->takeName(&PA); + SplitUsers.insert(&PA); + PA.replaceAllUsesWith(Res); + return {nullptr, nullptr}; +} + PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) { if (!isSplitFatPtr(IP.getType())) return {nullptr, nullptr}; @@ -2350,8 +2366,12 @@ static bool containsBufferFatPointers(const Function &F, BufferFatPtrToStructTypeMap *TypeMap) { bool HasFatPointers = false; for (const BasicBlock &BB : F) - for (const Instruction &I : BB) + for (const Instruction &I : BB) { HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType())); + // Catch null pointer constants in loads, stores, etc. 
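  // Illustration (hypothetical example): the operand scan catches cases like
  // "store ptr addrspace(7) null, ptr %p" -- the store itself has void type,
  // so the result-type check above misses it, but the stored constant is a
  // buffer fat pointer that still needs rewriting.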
+ for (const Value *V : I.operand_values()) + HasFatPointers |= (V->getType() != TypeMap->remapType(V->getType())); + } return HasFatPointers; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index aa72c3e..dfe7c53 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -352,7 +352,10 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { case Intrinsic::amdgcn_s_barrier_signal: case Intrinsic::amdgcn_s_barrier_signal_var: case Intrinsic::amdgcn_s_barrier_signal_isfirst: + case Intrinsic::amdgcn_s_barrier_init: + case Intrinsic::amdgcn_s_barrier_join: case Intrinsic::amdgcn_s_barrier_wait: + case Intrinsic::amdgcn_s_barrier_leave: case Intrinsic::amdgcn_s_get_barrier_state: case Intrinsic::amdgcn_wave_barrier: case Intrinsic::amdgcn_sched_barrier: @@ -381,7 +384,7 @@ bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA, AAResults *AA) { MemorySSAWalker *Walker = MSSA->getWalker(); SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)}; - SmallSet<MemoryAccess *, 8> Visited; + SmallPtrSet<MemoryAccess *, 8> Visited; MemoryLocation Loc(MemoryLocation::get(Load)); LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n'); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index b6c6d92..6ddfa38 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -29,7 +29,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) -MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 3a37518..28d5400 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -134,8 +134,8 @@ static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType( bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n'); - SmallSet<const Value *, 32> WorkSet; - SmallSet<const Value *, 32> Visited; + SmallPtrSet<const Value *, 32> WorkSet; + SmallPtrSet<const Value *, 32> Visited; if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) { if (isGlobalAddr(MO)) WorkSet.insert(MO); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp index 4009451..90c4f4e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp @@ -109,7 +109,7 @@ AMDGPUPreloadKernArgProlog::AMDGPUPreloadKernArgProlog(MachineFunction &MF) TRI(*ST.getRegisterInfo()) {} bool AMDGPUPreloadKernArgProlog::run() { - if (!ST.hasKernargPreload()) + if (!ST.needsKernArgPreloadProlog()) return false; unsigned NumKernArgPreloadSGPRs = MFI.getNumKernargPreloadedSGPRs(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp index 984c1ee..a386fe6 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp @@ -37,6 +37,11 @@ static cl::opt<unsigned> KernargPreloadCount( "amdgpu-kernarg-preload-count", cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0)); +static cl::opt<bool> + EnableKernargPreload("amdgpu-kernarg-preload", + cl::desc("Enable preload kernel arguments to SGPRs"), + cl::init(true)); + namespace { class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass { @@ -275,6 +280,9 @@ AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy( : ModulePass(ID), TM(TM) {} static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) { + if (!EnableKernargPreload) + return false; + SmallVector<Function *, 4> FunctionsToErase; bool Changed = false; for (auto &F : M) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 5a6ad40..8c56c21 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -724,10 +724,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}}); addRulesForGOpcs({G_PTR_ADD}) - .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}}) - .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}) - .Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}}) - .Any({{UniP4}, {{SgprP4}, {SgprP4, Sgpr64}}}); + .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}}) + .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}}) + .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}}) + .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}}); addRulesForGOpcs({G_INTTOPTR}) .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}}) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 868b1a2..2379296 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3342,6 +3342,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl( assert(OpdMapper.getVRegs(1).empty()); constrainOpWithReadfirstlane(B, MI, 1); return; + case Intrinsic::amdgcn_s_barrier_join: + constrainOpWithReadfirstlane(B, MI, 1); + return; + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: constrainOpWithReadfirstlane(B, MI, 1); constrainOpWithReadfirstlane(B, MI, 2); @@ -5515,6 +5519,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_s_sleep_var: OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); break; + case Intrinsic::amdgcn_s_barrier_join: + OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); + break; + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp index e2e5c57..d2ec7dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp @@ -195,13 +195,17 @@ bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) { // Delete FeatureWavefrontSize32 functions for // gfx9 and below targets that don't support the mode. 
- // gfx10+ is implied to support both wave32 and 64 features. + // gfx10, gfx11, gfx12 are implied to support both wave32 and 64 features. // They are not in the feature set. So, we need a separate check - if (ST->getGeneration() < AMDGPUSubtarget::GFX10 && - ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) { + if (!ST->supportsWave32() && ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) { reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize32); return true; } + // gfx125x only support FeatureWavefrontSize32. + if (!ST->supportsWave64() && ST->hasFeature(AMDGPU::FeatureWavefrontSize64)) { + reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize64); + return true; + } return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 8101c68..ccd2de1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -241,6 +241,9 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( if (!RC || !TRI.isVGPRClass(RC)) continue; + if (MI.isCall() || MI.isMetaInstruction()) + continue; + unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32); unsigned HWReg = TRI.getHWRegIndex(Reg); int MaxUsed = HWReg + Width - 1; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index f580f43..20b5fd9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -57,27 +57,47 @@ public: TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM), LIS(LIS) {} + // TODO: Remove this restriction + bool mfmaHasSameSrc2AndDstReg(const MachineInstr &MI) const { + const MachineOperand *Src2 = TII.getNamedOperand(MI, AMDGPU::OpName::src2); + const MachineOperand *Dst = TII.getNamedOperand(MI, AMDGPU::OpName::vdst); + return Src2->getReg() == Dst->getReg() && + Src2->getSubReg() == Dst->getSubReg(); + } + + bool isRewriteCandidate(const MachineInstr &MI) const { + return TII.isMAI(MI) && + AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1 && + mfmaHasSameSrc2AndDstReg(MI); + } + /// Compute the register class constraints based on the uses of \p Reg, - /// excluding uses from \p ExceptMI. This should be nearly identical to + /// excluding MFMA uses from which can be rewritten to change the register + /// class constraint. This should be nearly identical to /// MachineRegisterInfo::recomputeRegClass. const TargetRegisterClass * - recomputeRegClassExcept(Register Reg, const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC, - const MachineInstr *ExceptMI) const; + recomputeRegClassExceptRewritable(Register Reg, + const TargetRegisterClass *OldRC, + const TargetRegisterClass *NewRC) const; bool run(MachineFunction &MF) const; }; const TargetRegisterClass * -AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExcept( +AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( Register Reg, const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC, const MachineInstr *ExceptMI) const { + const TargetRegisterClass *NewRC) const { // Accumulate constraints from all uses. for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) { // Apply the effect of the given operand to NewRC. MachineInstr *MI = MO.getParent(); - if (MI == ExceptMI) + + // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the + // effects of rewrite candidates. 
It just so happens that we can use either + // AGPR or VGPR in src0/src1, so don't bother checking the constraint + // effects of the individual operands. + if (isRewriteCandidate(*MI)) continue; unsigned OpNo = &MO - &MI->getOperand(0); @@ -96,8 +116,10 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { return false; // Early exit if no AGPRs were assigned. - if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) + if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) { + LLVM_DEBUG(dbgs() << "skipping function that did not allocate AGPRs\n"); return false; + } bool MadeChange = false; @@ -109,17 +131,25 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // Find AV_* registers assigned to AGPRs. const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg); - if (!TRI.isVectorSuperClass(VirtRegRC)) + if (!TRI.hasAGPRs(VirtRegRC)) continue; - const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg); - if (!TRI.isAGPRClass(AssignedRC)) - continue; + const TargetRegisterClass *AssignedRC = VirtRegRC; + if (TRI.hasVGPRs(VirtRegRC)) { + // If this is an AV register, we have to check if the actual assignment is + // to an AGPR + AssignedRC = TRI.getPhysRegBaseClass(PhysReg); + if (!TRI.isAGPRClass(AssignedRC)) + continue; + } LiveInterval &LI = LIS.getInterval(VReg); // TODO: Test multiple uses for (VNInfo *VNI : LI.vnis()) { + if (VNI->isPHIDef() || VNI->isUnused()) + continue; + MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def); // TODO: Handle SplitKit produced copy bundles for partially defined @@ -183,10 +213,13 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // first place, as well as need to assign another register, and need to // figure out where to put them. The live range splitting is smarter than // anything we're doing here, so trust it did something reasonable. - const TargetRegisterClass *Src2ExceptRC = recomputeRegClassExcept( - Src2->getReg(), Src2VirtRegRC, VirtRegRC, CopySrcMI); - if (!Src2ExceptRC) + const TargetRegisterClass *Src2ExceptRC = + recomputeRegClassExceptRewritable(Src2->getReg(), Src2VirtRegRC, + VirtRegRC); + if (!Src2ExceptRC) { + LLVM_DEBUG(dbgs() << "Could not recompute the regclass\n"); continue; + } const TargetRegisterClass *NewSrc2ConstraintRC = TII.getRegClass(TII.get(AGPROp), Src2->getOperandNo(), &TRI, MF); @@ -196,8 +229,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { const TargetRegisterClass *NewSrc2RC = TRI.getCommonSubClass(Src2ExceptRC, NewSrc2ConstraintRC); if (!NewSrc2RC) { - // TODO: This is ignoring ther rewritable uses. e.g. a rewritable MFMA - // using a rewritable MFMA can be rewritten as a pair. LLVM_DEBUG(dbgs() << "Other uses of " << printReg(Src2->getReg(), &TRI) << " are incompatible with replacement class\n"); continue; @@ -208,8 +239,19 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { CopySrcMI->setDesc(TII.get(AGPROp)); - // TODO: Is replacing too aggressive, fixup these instructions only? - MRI.replaceRegWith(CopySrcReg, VReg); + // Perform replacement of the register, rewriting the rewritable uses. + for (MachineInstr &UseMI : + make_early_inc_range(MRI.reg_instructions(CopySrcReg))) { + if (TII.isMAI(UseMI)) { + // Note the register we need to rewrite may still appear in src0/src1, + // but that's fine since those can use A or V anyway. 
+ int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(UseMI.getOpcode()); + if (ReplacementOp != -1) + UseMI.setDesc(TII.get(ReplacementOp)); + } + + UseMI.substituteRegister(CopySrcReg, VReg, AMDGPU::NoSubRegister, TRI); + } LLVM_DEBUG(dbgs() << "Replaced VGPR MFMA with AGPR: " << *CopySrcMI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 10b8606..7be1899 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -378,6 +378,7 @@ foreach intr = AMDGPUImageDimAtomicIntrinsics in def : SourceOfDivergence<intr>; def : SourceOfDivergence<int_amdgcn_dead>; +def : SourceOfDivergence<int_amdgcn_call_whole_wave>; class AlwaysUniform<Intrinsic intr> { Intrinsic Intr = intr; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp index b60ded3..56aa3f6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp @@ -195,7 +195,7 @@ bool AMDGPUSetWavePriority::run(MachineFunction &MF) { // Lower the priority on edges where control leaves blocks from which // the VMEM loads are reachable. - SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks; + SmallPtrSet<MachineBasicBlock *, 16> PriorityLoweringBlocks; for (MachineBasicBlock &MBB : MF) { if (MBBInfos[&MBB].MayReachVMEMLoad) { if (MBB.succ_empty()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c1f1703..e393aa19 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -848,8 +848,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (Level == OptimizationLevel::O0) return; - PM.addPass(AMDGPUUnifyMetadataPass()); - // We don't want to run internalization at per-module stage. if (InternalizeSymbols && !isLTOPreLink(Phase)) { PM.addPass(InternalizePass(mustPreserveGV)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp deleted file mode 100644 index e400491..0000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp +++ /dev/null @@ -1,119 +0,0 @@ -//===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// \file -// This pass that unifies multiple OpenCL metadata due to linking. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Pass.h" - -using namespace llvm; - -namespace { - - namespace kOCLMD { - - const char SpirVer[] = "opencl.spir.version"; - const char OCLVer[] = "opencl.ocl.version"; - const char UsedExt[] = "opencl.used.extensions"; - const char UsedOptCoreFeat[] = "opencl.used.optional.core.features"; - const char CompilerOptions[] = "opencl.compiler.options"; - const char LLVMIdent[] = "llvm.ident"; - - } // end namespace kOCLMD - - /// Unify version metadata. - /// \return true if changes are made. - /// Assume the named metadata has operands each of which is a pair of - /// integer constant, e.g. 
- /// !Name = {!n1, !n2} - /// !n1 = {i32 1, i32 2} - /// !n2 = {i32 2, i32 0} - /// Keep the largest version as the sole operand if PickFirst is false. - /// Otherwise pick it from the first value, representing kernel module. - bool unifyVersionMD(Module &M, StringRef Name, bool PickFirst) { - auto *NamedMD = M.getNamedMetadata(Name); - if (!NamedMD || NamedMD->getNumOperands() <= 1) - return false; - MDNode *MaxMD = nullptr; - auto MaxVer = 0U; - for (auto *VersionMD : NamedMD->operands()) { - assert(VersionMD->getNumOperands() == 2); - auto *CMajor = mdconst::extract<ConstantInt>(VersionMD->getOperand(0)); - auto VersionMajor = CMajor->getZExtValue(); - auto *CMinor = mdconst::extract<ConstantInt>(VersionMD->getOperand(1)); - auto VersionMinor = CMinor->getZExtValue(); - auto Ver = (VersionMajor * 100) + (VersionMinor * 10); - if (Ver > MaxVer) { - MaxVer = Ver; - MaxMD = VersionMD; - } - if (PickFirst) - break; - } - NamedMD->eraseFromParent(); - NamedMD = M.getOrInsertNamedMetadata(Name); - NamedMD->addOperand(MaxMD); - return true; - } - - /// Unify version metadata. - /// \return true if changes are made. - /// Assume the named metadata has operands each of which is a list e.g. - /// !Name = {!n1, !n2} - /// !n1 = !{!"cl_khr_fp16", {!"cl_khr_fp64"}} - /// !n2 = !{!"cl_khr_image"} - /// Combine it into a single list with unique operands. - bool unifyExtensionMD(Module &M, StringRef Name) { - auto *NamedMD = M.getNamedMetadata(Name); - if (!NamedMD || NamedMD->getNumOperands() == 1) - return false; - - SmallVector<Metadata *, 4> All; - for (auto *MD : NamedMD->operands()) - for (const auto &Op : MD->operands()) - if (!llvm::is_contained(All, Op.get())) - All.push_back(Op.get()); - - NamedMD->eraseFromParent(); - NamedMD = M.getOrInsertNamedMetadata(Name); - for (const auto &MD : All) - NamedMD->addOperand(MDNode::get(M.getContext(), MD)); - - return true; - } - - /// Unify multiple OpenCL metadata due to linking. - bool unifyMetadataImpl(Module &M) { - const char *Vers[] = {kOCLMD::SpirVer, kOCLMD::OCLVer}; - const char *Exts[] = {kOCLMD::UsedExt, kOCLMD::UsedOptCoreFeat, - kOCLMD::CompilerOptions, kOCLMD::LLVMIdent}; - - bool Changed = false; - - for (auto &I : Vers) - Changed |= unifyVersionMD(M, I, true); - - for (auto &I : Exts) - Changed |= unifyExtensionMD(M, I); - - return Changed; - } - - } // end anonymous namespace - - PreservedAnalyses AMDGPUUnifyMetadataPass::run(Module &M, - ModuleAnalysisManager &AM) { - return unifyMetadataImpl(M) ? PreservedAnalyses::none() - : PreservedAnalyses::all(); - } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 0d2feeb..9514732 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -5052,11 +5052,13 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, if (DppCtrlIdx >= 0) { unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm(); - if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) && - AMDGPU::isDPALU_DPP(MII.get(Opc))) { - // DP ALU DPP is supported for row_newbcast only on GFX9* + if (!AMDGPU::isLegalDPALU_DPPControl(getSTI(), DppCtrl) && + AMDGPU::isDPALU_DPP(MII.get(Opc), getSTI())) { + // DP ALU DPP is supported for row_newbcast only on GFX9* and row_share + // only on GFX12. SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands); - Error(S, "DP ALU dpp only supports row_newbcast"); + Error(S, isGFX12() ? 
"DP ALU dpp only supports row_share" + : "DP ALU dpp only supports row_newbcast"); return false; } } @@ -6268,8 +6270,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ExprVal, ValRange); } else if (ID == ".amdhsa_workgroup_processor_mode") { - if (IVersion.Major < 10) - return Error(IDRange.Start, "directive requires gfx10+", IDRange); + if (!supportsWGP(getSTI())) + return Error(IDRange.Start, + "directive unsupported on " + getSTI().getCPU(), IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ExprVal, ValRange); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index c466f9c..dc9dd22 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -114,7 +114,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPUTargetTransformInfo.cpp AMDGPUWaitSGPRHazards.cpp AMDGPUUnifyDivergentExitNodes.cpp - AMDGPUUnifyMetadata.cpp R600MachineCFGStructurizer.cpp GCNCreateVOPD.cpp GCNDPPCombine.cpp diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index d5d1074..f5d4384 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1274,7 +1274,7 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in { defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>; } -let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus in { +let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts in { defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VReg_96>; defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VReg_64>; } diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index f9a907a..184929a 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -421,6 +421,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) { DPPInst.addImm(ByteSelOpr->getImm()); } + if (MachineOperand *BitOp3 = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::bitop3)) { + assert(AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::bitop3)); + DPPInst.add(*BitOp3); + } } DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); @@ -544,11 +549,17 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { return false; } - if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || - MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) { - auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); - assert(DppCtrl && DppCtrl->isImm()); - if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) { + auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); + assert(DppCtrl && DppCtrl->isImm()); + unsigned DppCtrlVal = DppCtrl->getImm(); + if ((MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp)) { + if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP)) { + LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move is unsupported\n"); + // Split it. 
+ return false; + } + if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal)) { LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported" " control value\n"); // Let it split, then control may become legal. @@ -704,6 +715,20 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { break; } + if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP) && + AMDGPU::isDPALU_DPP32BitOpc(OrigOp)) { + LLVM_DEBUG(dbgs() << " " << OrigMI + << " failed: DPP ALU DPP is not supported\n"); + break; + } + + if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal) && + AMDGPU::isDPALU_DPP(TII->get(OrigOp), *ST)) { + LLVM_DEBUG(dbgs() << " " << OrigMI + << " failed: not valid 64-bit DPP control value\n"); + break; + } + LLVM_DEBUG(dbgs() << " combining: " << OrigMI); if (Use == Src0) { if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR, diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 96cb5ae..a3b64ae 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1200,6 +1200,14 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); fixRequiredExportPriority(MI); + if (ST.requiresWaitIdleBeforeGetReg()) + fixGetRegWaitIdle(MI); + if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug()) + fixDsAtomicAsyncBarrierArriveB64(MI); + if (ST.hasScratchBaseForwardingHazard()) + fixScratchBaseForwardingHazard(MI); + if (ST.setRegModeNeedsVNOPs()) + fixSetRegMode(MI); } static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, @@ -1350,6 +1358,9 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { return (Decoded.DsCnt == 0); } default: + assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) || + MI.getOpcode() == AMDGPU::S_WAIT_IDLE) && + "unexpected wait count instruction"); // SOPP instructions cannot mitigate the hazard. 
if (TII->isSOPP(MI)) return false; @@ -1731,7 +1742,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(0x0fff); + .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); return true; } @@ -1781,7 +1792,7 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - I.getOperand(0).getImm() == 0x0fff)) + AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0)) return HazardExpired; // Track registers writes @@ -2239,19 +2250,7 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) return true; - switch (MI.getOpcode()) { - case AMDGPU::S_WAITCNT: - case AMDGPU::S_WAITCNT_VSCNT: - case AMDGPU::S_WAITCNT_VMCNT: - case AMDGPU::S_WAITCNT_EXPCNT: - case AMDGPU::S_WAITCNT_LGKMCNT: - case AMDGPU::S_WAIT_IDLE: - return true; - default: - break; - } - - return false; + return SIInstrInfo::isWaitcnt(MI.getOpcode()); }; return FPAtomicToDenormModeWaitStates - @@ -3428,3 +3427,125 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) { return true; } + +bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) { + if (!isSGetReg(MI->getOpcode())) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + switch (getHWReg(TII, *MI)) { + default: + return false; + case AMDGPU::Hwreg::ID_STATUS: + case AMDGPU::Hwreg::ID_STATE_PRIV: + case AMDGPU::Hwreg::ID_EXCP_FLAG_PRIV: + case AMDGPU::Hwreg::ID_EXCP_FLAG_USER: + break; + } + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0); + return true; +} + +bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) { + if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0xFFE3); + BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0xFFE3); + + return true; +} + +bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) { + // No reason to check this in pre-RA scheduling, SGPRs have to be allocated + // for hazard to trigger. + if (!IsHazardRecognizerMode) + return false; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU. + const int FlatScrBaseWaitStates = 10; + + bool ReadsFlatScrLo = + MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI); + bool ReadsFlatScrHi = + MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI); + if (isSGetReg(MI->getOpcode())) { + switch (getHWReg(TII, *MI)) { + default: + break; + case AMDGPU::Hwreg::ID_FLAT_SCR_LO: + ReadsFlatScrLo = true; + break; + case AMDGPU::Hwreg::ID_FLAT_SCR_HI: + ReadsFlatScrHi = true; + break; + } + } + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + auto IsRegDefHazard = [&](Register Reg) -> bool { + DenseSet<const MachineBasicBlock *> Visited; + auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) { + return MI.modifiesRegister(Reg, TRI); + }; + + // This literally abuses the idea of waitstates. Instead of waitstates it + // returns 1 for SGPR written and 0 otherwise. 
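
The comment above is the key to this helper: the generic backwards hazard search is reused, but its "wait states" callback returns 1 only when an instruction writes an SGPR, so the search expires after a fixed number of SGPR writes (FlatScrBaseWaitStates) rather than after that many cycles, or earlier at an s_waitcnt_depctr that already forces sa_sdst and va_sdst to zero. A small standalone model of that counting trick, using hypothetical plain types instead of the in-tree getWaitStatesSince machinery:

```cpp
#include <vector>

// Toy instruction summary; each field stands in for a query the real code
// makes against MachineInstr / SIInstrInfo.
struct InstrSummary {
  bool WritesSGPR;    // SALU/VALU instruction defining some SGPR
  bool ClearsHazard;  // s_waitcnt_depctr with sa_sdst == 0 and va_sdst == 0
  bool WritesWatched; // writes the watched register (s102/s103 here)
};

// Walk predecessors from nearest to furthest. Return true if a write to the
// watched register is still "live" within the window, i.e. fewer than Window
// SGPR writes (not cycles) have happened since it.
static bool hazardWithinSgprWriteWindow(const std::vector<InstrSummary> &Preds,
                                        int Window) {
  int SgprWrites = 0;
  for (const InstrSummary &I : Preds) {
    if (I.ClearsHazard || SgprWrites >= Window)
      return false;           // expired: no wait needed
    if (I.WritesWatched)
      return true;            // hazard: insert the s_waitcnt_depctr
    SgprWrites += I.WritesSGPR ? 1 : 0;
  }
  return false;
}
```
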
+ auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned { + if (!TII->isSALU(MI) && !TII->isVALU(MI)) + return 0; + for (const MachineOperand &MO : MI.all_defs()) { + if (TRI->isSGPRReg(MRI, MO.getReg())) + return 1; + } + return 0; + }; + + auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) { + if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) { + unsigned Wait = MI.getOperand(0).getImm(); + if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 && + AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0) + return true; + } + return SgprWrites >= FlatScrBaseWaitStates; + }; + + return ::getWaitStatesSince( + IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()), + 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates; + }; + + if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) || + !IsRegDefHazard(AMDGPU::SGPR102)) && + (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) || + !IsRegDefHazard(AMDGPU::SGPR103))) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(AMDGPU::DepCtr::encodeFieldVaSdst( + AMDGPU::DepCtr::encodeFieldSaSdst(0), 0)); + return true; +} + +bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) { + if (!isSSetReg(MI->getOpcode()) || + MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32)); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32)); + return true; +} diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index f796eeae..67beffa 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -110,6 +110,10 @@ private: bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); bool fixRequiredExportPriority(MachineInstr *MI); + bool fixGetRegWaitIdle(MachineInstr *MI); + bool fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI); + bool fixScratchBaseForwardingHazard(MachineInstr *MI); + bool fixSetRegMode(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); int checkMAIHazards908(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 334afd3..ef63acc 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -368,46 +368,45 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, //////////////////////////////////////////////////////////////////////////////// // GCNRPTarget -GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { +GCNRPTarget::GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const Function &F = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - setRegLimits(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F), MF); + setTarget(ST.getMaxNumSGPRs(F), ST.getMaxNumVGPRs(F)); } GCNRPTarget::GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { - setRegLimits(NumSGPRs, NumVGPRs, MF); + const MachineFunction &MF, const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { + setTarget(NumSGPRs, NumVGPRs); } GCNRPTarget::GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, 
- const GCNRegPressure &RP, bool CombineVGPRSavings) - : RP(RP), CombineVGPRSavings(CombineVGPRSavings) { + const GCNRegPressure &RP) + : GCNRPTarget(RP, MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned DynamicVGPRBlockSize = MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - setRegLimits(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), - ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize), MF); + setTarget(ST.getMaxNumSGPRs(Occupancy, /*Addressable=*/false), + ST.getMaxNumVGPRs(Occupancy, DynamicVGPRBlockSize)); } -void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs, - const MachineFunction &MF) { +void GCNRPTarget::setTarget(unsigned NumSGPRs, unsigned NumVGPRs) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs); MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs); - MaxUnifiedVGPRs = - ST.hasGFX90AInsts() - ? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs) - : 0; + if (UnifiedRF) { + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxUnifiedVGPRs = + std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs); + } else { + MaxUnifiedVGPRs = 0; + } } -bool GCNRPTarget::isSaveBeneficial(Register Reg, - const MachineRegisterInfo &MRI) const { +bool GCNRPTarget::isSaveBeneficial(Register Reg) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *RC = MRI.getRegClass(Reg); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); @@ -416,16 +415,19 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg, return RP.getSGPRNum() > MaxSGPRs; unsigned NumVGPRs = SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum(); - return isVGPRBankSaveBeneficial(NumVGPRs); + // The addressable limit must always be respected. + if (NumVGPRs > MaxVGPRs) + return true; + // For unified RFs, combined VGPR usage limit must be respected as well. + return UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs; } bool GCNRPTarget::satisfied() const { - if (RP.getSGPRNum() > MaxSGPRs) + if (RP.getSGPRNum() > MaxSGPRs || RP.getVGPRNum(false) > MaxVGPRs) return false; - if (RP.getVGPRNum(false) > MaxVGPRs && - (!CombineVGPRSavings || !satisifiesVGPRBanksTarget())) + if (UnifiedRF && RP.getVGPRNum(true) > MaxUnifiedVGPRs) return false; - return satisfiesUnifiedTarget(); + return true; } /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index ea33a22..a9c58bb 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -186,20 +186,22 @@ public: /// Sets up the target such that the register pressure starting at \p RP does /// not show register spilling on function \p MF (w.r.t. the function's /// mininum target occupancy). - GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP, - bool CombineVGPRSavings = false); + GCNRPTarget(const MachineFunction &MF, const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not use more than \p NumSGPRs SGPRs and \p NumVGPRs VGPRs on function \p /// MF. 
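
After this refactor the target test reduces to two caps: the ArchVGPR/AGPR bank limit (MaxVGPRs) always applies, and subtargets with a unified register file additionally have to respect the combined cap (MaxUnifiedVGPRs). A compact standalone restatement of satisfied()/isSaveBeneficial(), with the pressure numbers passed in directly rather than read from GCNRegPressure:

```cpp
// MaxUnifiedVGPRs == 0 encodes "no unified register file", matching how the
// class leaves the field at 0 on such subtargets.
struct RPTargetSketch {
  unsigned MaxSGPRs, MaxVGPRs, MaxUnifiedVGPRs;

  bool satisfied(unsigned SGPRs, unsigned VGPRsPerBank,
                 unsigned VGPRsCombined) const {
    if (SGPRs > MaxSGPRs || VGPRsPerBank > MaxVGPRs)
      return false;                    // addressable limits always apply
    return MaxUnifiedVGPRs == 0 ||     // non-unified RF: nothing more to check
           VGPRsCombined <= MaxUnifiedVGPRs;
  }

  // Saving a VGPR of either bank helps if that bank is over its cap, or if
  // the combined cap of a unified RF is exceeded.
  bool vgprSaveBeneficial(unsigned BankVGPRs, unsigned VGPRsCombined) const {
    if (BankVGPRs > MaxVGPRs)
      return true;
    return MaxUnifiedVGPRs != 0 && VGPRsCombined > MaxUnifiedVGPRs;
  }
};
```
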
GCNRPTarget(unsigned NumSGPRs, unsigned NumVGPRs, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); /// Sets up the target such that the register pressure starting at \p RP does /// not prevent achieving an occupancy of at least \p Occupancy on function /// \p MF. GCNRPTarget(unsigned Occupancy, const MachineFunction &MF, - const GCNRegPressure &RP, bool CombineVGPRSavings = false); + const GCNRegPressure &RP); + + /// Changes the target (same semantics as constructor). + void setTarget(unsigned NumSGPRs, unsigned NumVGPRs); const GCNRegPressure &getCurrentRP() const { return RP; } @@ -207,7 +209,7 @@ public: /// Determines whether saving virtual register \p Reg will be beneficial /// towards achieving the RP target. - bool isSaveBeneficial(Register Reg, const MachineRegisterInfo &MRI) const; + bool isSaveBeneficial(Register Reg) const; /// Saves virtual register \p Reg with lanemask \p Mask. void saveReg(Register Reg, LaneBitmask Mask, const MachineRegisterInfo &MRI) { @@ -227,15 +229,15 @@ public: if (Target.MaxUnifiedVGPRs) { OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs << " VGPRs (unified)"; - } else if (Target.CombineVGPRSavings) { - OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/' - << 2 * Target.MaxVGPRs << " VGPRs (combined target)"; } return OS; } #endif private: + const MachineFunction &MF; + const bool UnifiedRF; + /// Current register pressure. GCNRegPressure RP; @@ -246,29 +248,10 @@ private: /// Target number of overall VGPRs for subtargets with unified RFs. Always 0 /// for subtargets with non-unified RFs. unsigned MaxUnifiedVGPRs; - /// Whether we consider that the register allocator will be able to swap - /// between ArchVGPRs and AGPRs by copying them to a super register class. - /// Concretely, this allows savings in one of the VGPR banks to help toward - /// savings in the other VGPR bank. - bool CombineVGPRSavings; - - inline bool satisifiesVGPRBanksTarget() const { - assert(CombineVGPRSavings && "only makes sense with combined savings"); - return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs; - } - - /// Always satisified when the subtarget doesn't have a unified RF. - inline bool satisfiesUnifiedTarget() const { - return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs; - } - - inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const { - return NumVGPRs > MaxVGPRs || !satisfiesUnifiedTarget() || - (CombineVGPRSavings && !satisifiesVGPRBanksTarget()); - } - void setRegLimits(unsigned MaxSGPRs, unsigned MaxVGPRs, - const MachineFunction &MF); + GCNRPTarget(const GCNRegPressure &RP, const MachineFunction &MF) + : MF(MF), UnifiedRF(MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()), + RP(RP) {} }; /////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 96d5668..254b75b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1086,7 +1086,8 @@ bool ClusteredLowOccStage::initGCNSchedStage() { } /// Allows to easily filter for this stage's debug output. 
-#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;) +#define REMAT_PREFIX "[PreRARemat] " +#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;) bool PreRARematStage::initGCNSchedStage() { // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for @@ -1115,10 +1116,15 @@ bool PreRARematStage::initGCNSchedStage() { rematerialize(); if (GCNTrackers) DAG.RegionLiveOuts.buildLiveRegMap(); - REMAT_DEBUG( - dbgs() << "Retrying function scheduling with new min. occupancy of " - << AchievedOcc << " from rematerializing (original was " - << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n"); + REMAT_DEBUG({ + dbgs() << "Retrying function scheduling with new min. occupancy of " + << AchievedOcc << " from rematerializing (original was " + << DAG.MinOccupancy; + if (TargetOcc) + dbgs() << ", target was " << *TargetOcc; + dbgs() << ")\n"; + }); + if (AchievedOcc > DAG.MinOccupancy) { DAG.MinOccupancy = AchievedOcc; SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); @@ -1540,8 +1546,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { return GCNSchedStage::shouldRevertScheduling(WavesAfter) || - mayCauseSpilling(WavesAfter) || - (IncreaseOccupancy && WavesAfter < TargetOcc); + mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc); } bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { @@ -1687,78 +1692,63 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, } bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { - REMAT_DEBUG({ - dbgs() << "Collecting rematerializable instructions in "; - MF.getFunction().printAsOperand(dbgs(), false); - dbgs() << '\n'; - }); + const Function &F = MF.getFunction(); // Maps optimizable regions (i.e., regions at minimum and register-limited // occupancy, or regions with spilling) to the target RP we would like to // reach. DenseMap<unsigned, GCNRPTarget> OptRegions; - const Function &F = MF.getFunction(); - unsigned DynamicVGPRBlockSize = - MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); - - std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F); - const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F); - const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F); - const unsigned MaxSGPRsIncOcc = - ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false); - const unsigned MaxVGPRsIncOcc = - ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize); - IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy; - - // Collect optimizable regions. If there is spilling in any region we will - // just try to reduce spilling. Otherwise we will try to increase occupancy by - // one in the whole function. - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - GCNRegPressure &RP = DAG.Pressure[I]; - // We allow ArchVGPR or AGPR savings to count as savings of the other kind - // of VGPR only when trying to eliminate spilling. We cannot do this when - // trying to increase occupancy since VGPR class swaps only occur later in - // the register allocator i.e., the scheduler will not be able to reason - // about these savings and will not report an increase in the achievable - // occupancy, triggering rollbacks. - GCNRPTarget Target(MaxSGPRsNoSpill, MaxVGPRsNoSpill, MF, RP, - /*CombineVGPRSavings=*/true); - if (!Target.satisfied() && IncreaseOccupancy) { - // There is spilling in the region and we were so far trying to increase - // occupancy. 
Strop trying that and focus on reducing spilling. - IncreaseOccupancy = false; - OptRegions.clear(); - } else if (IncreaseOccupancy) { - // There is no spilling in the region, try to increase occupancy. - Target = GCNRPTarget(MaxSGPRsIncOcc, MaxVGPRsIncOcc, MF, RP, - /*CombineVGPRSavings=*/false); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(F); + unsigned MaxVGPRs = ST.getMaxNumVGPRs(F); + auto ResetTargetRegions = [&]() { + OptRegions.clear(); + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + const GCNRegPressure &RP = DAG.Pressure[I]; + GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP); + if (!Target.satisfied()) + OptRegions.insert({I, Target}); } - if (!Target.satisfied()) - OptRegions.insert({I, Target}); - } - if (OptRegions.empty()) - return false; + }; -#ifndef NDEBUG - if (IncreaseOccupancy) { - REMAT_DEBUG(dbgs() << "Occupancy minimal (" << DAG.MinOccupancy - << ") in regions:\n"); + ResetTargetRegions(); + if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) { + // In addition to register usage being above addressable limits, occupancy + // below the minimum is considered like "spilling" as well. + TargetOcc = std::nullopt; } else { - REMAT_DEBUG(dbgs() << "Spilling w.r.t. minimum target occupancy (" - << WavesPerEU.first << ") in regions:\n"); - } - for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { - if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) - REMAT_DEBUG(dbgs() << " [" << I << "] " << OptIt->getSecond() << '\n'); + // There is no spilling and room to improve occupancy; set up "increased + // occupancy targets" for all regions. + TargetOcc = DAG.MinOccupancy + 1; + unsigned VGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false); + MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize); + ResetTargetRegions(); } -#endif - - // When we are reducing spilling, the target is the minimum target number of - // waves/EU determined by the subtarget. In cases where either one of - // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current - // minimum region occupancy may be higher than the latter. - TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 - : std::max(DAG.MinOccupancy, WavesPerEU.first); + REMAT_DEBUG({ + dbgs() << "Analyzing "; + MF.getFunction().printAsOperand(dbgs(), false); + dbgs() << ": "; + if (OptRegions.empty()) { + dbgs() << "no objective to achieve, occupancy is maximal at " + << MFI.getMaxWavesPerEU(); + } else if (!TargetOcc) { + dbgs() << "reduce spilling (minimum target occupancy is " + << MFI.getMinWavesPerEU() << ')'; + } else { + dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to " + << TargetOcc; + } + dbgs() << '\n'; + for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { + if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) { + dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond() + << '\n'; + } + } + }); + if (OptRegions.empty()) + return false; // Accounts for a reduction in RP in an optimizable region. 
Returns whether we // estimate that we have identified enough rematerialization opportunities to @@ -1767,7 +1757,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask, bool &Progress) -> bool { GCNRPTarget &Target = OptIt->getSecond(); - if (!Target.isSaveBeneficial(Reg, DAG.MRI)) + if (!Target.isSaveBeneficial(Reg)) return false; Progress = true; Target.saveReg(Reg, Mask, DAG.MRI); @@ -1876,7 +1866,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { } } - if (IncreaseOccupancy) { + if (TargetOcc) { // We were trying to increase occupancy but failed, abort the stage. REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n"); Rematerializations.clear(); @@ -1979,7 +1969,9 @@ void PreRARematStage::rematerialize() { // All regions impacted by at least one rematerialization must be rescheduled. // Maximum pressure must also be recomputed for all regions where it changed // non-predictably and checked against the target occupancy. - AchievedOcc = TargetOcc; + unsigned DynamicVGPRBlockSize = + MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize(); + AchievedOcc = MFI.getMaxWavesPerEU(); for (auto &[I, OriginalRP] : ImpactedRegions) { bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second; RescheduleRegions[I] = !IsEmptyRegion; @@ -2003,9 +1995,8 @@ void PreRARematStage::rematerialize() { } } DAG.Pressure[I] = RP; - AchievedOcc = std::min( - AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>() - ->getDynamicVGPRBlockSize())); + AchievedOcc = + std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize)); } REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); } @@ -2035,7 +2026,7 @@ void PreRARematStage::finalizeGCNSchedStage() { // which case we do not want to rollback either (the rescheduling was already // reverted in PreRARematStage::shouldRevertScheduling in such cases). unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy); - if (!IncreaseOccupancy || MaxOcc >= TargetOcc) + if (!TargetOcc || MaxOcc >= *TargetOcc) return; REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n"); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 32139a9..790370f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -470,15 +470,12 @@ private: /// After successful stage initialization, indicates which regions should be /// rescheduled. BitVector RescheduleRegions; - /// Target occupancy the stage estimates is reachable through - /// rematerialization. Greater than or equal to the pre-stage min occupancy. - unsigned TargetOcc; + /// The target occupancy the stage is trying to achieve. Empty when the + /// objective is spilling reduction. + std::optional<unsigned> TargetOcc; /// Achieved occupancy *only* through rematerializations (pre-rescheduling). /// Smaller than or equal to the target occupancy. unsigned AchievedOcc; - /// Whether the stage is attempting to increase occupancy in the abscence of - /// spilling. - bool IncreaseOccupancy; /// Returns whether remat can reduce spilling or increase function occupancy /// by 1 through rematerialization. If it can do one, collects instructions in diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f47ddf5..2a8385d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -390,7 +390,11 @@ public: /// the original value. 
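
Returning to the PreRARematStage hunk above: with TargetOcc now an optional, the stage's objective selection distills to a small decision, sketched here with simplified inputs (booleans and plain occupancy numbers, not the DAG/RP structures):

```cpp
#include <optional>

// Toy restatement: an empty optional means "reduce spilling"; a value means
// "try to raise occupancy to this many waves per EU".
static std::optional<unsigned>
pickRematObjective(bool AnyRegionOverNoSpillLimits, unsigned MinOccupancy,
                   unsigned MaxWavesPerEU) {
  if (AnyRegionOverNoSpillLimits || MinOccupancy >= MaxWavesPerEU)
    return std::nullopt;   // spilling to fix, or occupancy already at the cap
  return MinOccupancy + 1; // otherwise chase one more wave per EU
}
```

shouldRevertScheduling() then only compares against the target when one exists (`TargetOcc && WavesAfter < TargetOcc` in the hunk above), so spilling-reduction runs no longer trigger occupancy-based rollbacks.
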
bool zeroesHigh16BitsOfDest(unsigned Opcode) const; - bool supportsWGP() const { return getGeneration() >= GFX10; } + bool supportsWGP() const { + if (GFX1250Insts) + return false; + return getGeneration() >= GFX10; + } bool hasIntClamp() const { return HasIntClamp; @@ -1341,6 +1345,10 @@ public: bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; } + bool setRegModeNeedsVNOPs() const { + return GFX1250Insts && getGeneration() == GFX12; + } + /// Return if operations acting on VGPR tuples require even alignment. bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; } @@ -1573,6 +1581,12 @@ public: // extended VA to 57 bits. bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; } + // \returns true if the target needs to create a prolog for backward + // compatibility when preloading kernel arguments. + bool needsKernArgPreloadProlog() const { + return hasKernargPreload() && !GFX1250Insts; + } + /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { return AMDGPU::IsaInfo::getSGPRAllocGranule(this); @@ -1722,6 +1736,10 @@ public: /// unit requirement. unsigned getMaxNumVGPRs(const MachineFunction &MF) const; + bool supportsWave32() const { return getGeneration() >= GFX10; } + + bool supportsWave64() const { return !hasGFX1250Insts(); } + bool isWave32() const { return getWavefrontSize() == 32; } @@ -1785,11 +1803,11 @@ public: // \returns true if the subtarget has a hazard requiring an "s_nop 0" // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". - bool requiresNopBeforeDeallocVGPRs() const { - // Currently all targets that support the dealloc VGPRs message also require - // the nop. - return true; - } + bool requiresNopBeforeDeallocVGPRs() const { return !GFX1250Insts; } + + // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on + // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER. + bool requiresWaitIdleBeforeGetReg() const { return GFX1250Insts; } bool isDynamicVGPREnabled() const { return DynamicVGPR; } unsigned getDynamicVGPRBlockSize() const { @@ -1801,6 +1819,18 @@ public: // to the same register. return false; } + + // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything + // and surronded by S_WAIT_ALU(0xFFE3). + bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const { + return getGeneration() == GFX12; + } + + // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base + // read. + bool hasScratchBaseForwardingHazard() const { + return GFX1250Insts && getGeneration() == GFX12; + } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index ee8683a..aafbdc2 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -976,8 +976,10 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, unsigned Imm = MI->getOperand(OpNo).getImm(); const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - if (!AMDGPU::isLegalDPALU_DPPControl(Imm) && AMDGPU::isDPALU_DPP(Desc)) { - O << " /* DP ALU dpp only supports row_newbcast */"; + if (!AMDGPU::isLegalDPALU_DPPControl(STI, Imm) && + AMDGPU::isDPALU_DPP(Desc, STI)) { + O << " /* DP ALU dpp only supports " + << (isGFX12(STI) ? 
"row_share" : "row_newbcast") << " */"; return; } if (Imm <= DppCtrl::QUAD_PERM_LAST) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index f358084..61f6732 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -389,6 +389,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) && // Matrix B format operand reuses op_sel_hi. !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_fmt) && + // Matrix B scale operand reuses op_sel_hi. + !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_scale) && // Matrix B reuse operand reuses op_sel_hi. !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) { Encoding |= getImplicitOpSelHiEncoding(Opcode); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 68302f0..e20581d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -26,7 +26,6 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/TargetParser/TargetParser.h" @@ -563,11 +562,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PrintField(KD.compute_pgm_rsrc3, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ".amdhsa_tg_split"); - if (IVersion.Major >= 10) { + if (AMDGPU::supportsWGP(STI)) PrintField(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ".amdhsa_workgroup_processor_mode"); + if (IVersion.Major >= 10) { PrintField(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, @@ -885,7 +885,7 @@ void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, if (!SymbolELF->isBindingSet()) SymbolELF->setBinding(ELF::STB_GLOBAL); - if (SymbolELF->declareCommon(Size, Alignment, true)) { + if (SymbolELF->declareCommon(Size, Alignment)) { report_fatal_error("Symbol: " + Symbol->getName() + " redeclared as different type"); } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 2d0102f..7c01903 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -197,7 +197,7 @@ enum ClassFlags : unsigned { namespace AMDGPU { enum OperandType : unsigned { - /// Operands with register or 32-bit immediate + /// Operands with register, 32-bit, or 64-bit immediate OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET, OPERAND_REG_IMM_INT64, OPERAND_REG_IMM_INT16, @@ -407,7 +407,7 @@ enum CPol { SCAL = 1 << 11, // Scale offset bit - ALL = TH | SCOPE, + ALL = TH | SCOPE | NV, // Helper bits TH_TYPE_LOAD = 1 << 7, // TH_LOAD policy @@ -440,6 +440,7 @@ enum Id { // Message ID, width(4) [3:0]. 
ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10 ID_GS_ALLOC_REQ = 9, // added in GFX9 ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11 + ID_SAVEWAVE_HAS_TDM = 10, // added in GFX1250 ID_GET_DDID = 11, // added in GFX10, removed in GFX11 ID_SYSMSG = 15, @@ -513,6 +514,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_HW_ID2 = 24, ID_POPS_PACKER = 25, ID_PERF_SNAPSHOT_DATA_gfx11 = 27, + ID_IB_STS2 = 28, ID_SHADER_CYCLES = 29, ID_SHADER_CYCLES_HI = 30, ID_DVGPR_ALLOC_LO = 31, diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index f018f77..dce4e6f 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -460,7 +460,7 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, // List of clobbering instructions. SmallVector<MachineInstr*, 8> Clobbers; // List of instructions marked for deletion. - SmallSet<MachineInstr*, 8> MergedInstrs; + SmallPtrSet<MachineInstr *, 8> MergedInstrs; bool Changed = false; @@ -808,7 +808,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) { void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { bool AllAGPRUses = true; SetVector<const MachineInstr *> worklist; - SmallSet<const MachineInstr *, 4> Visited; + SmallPtrSet<const MachineInstr *, 4> Visited; SetVector<MachineInstr *> PHIOperands; worklist.insert(&MI); Visited.insert(&MI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5b327fb..561019b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments( if (!IsKernel) { CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); CCInfo.AnalyzeFormalArguments(Splits, AssignFn); + + // This assumes the registers are allocated by CCInfo in ascending order + // with no gaps. + Info->setNumWaveDispatchSGPRs( + CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters())); + Info->setNumWaveDispatchVGPRs( + CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters())); + } else if (Info->getNumKernargPreloadedSGPRs()) { + Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs()); } SmallVector<SDValue, 16> Chains; @@ -6612,7 +6621,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, unsigned SplitSize = 32; if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) && ST->hasDPALU_DPP() && - AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3))) + AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3))) SplitSize = 64; auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, @@ -10816,6 +10825,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, Op->getOperand(2), Chain), 0); + case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: { // these two intrinsics have two operands: barrier pointer and member count SDValue Chain = Op->getOperand(0); @@ -10823,6 +10833,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue BarOp = Op->getOperand(2); SDValue CntOp = Op->getOperand(3); SDValue M0Val; + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init + ? 
AMDGPU::S_BARRIER_INIT_M0 + : AMDGPU::S_BARRIER_SIGNAL_M0; // extract the BarrierID from bits 4-9 of BarOp SDValue BarID; BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp, @@ -10846,8 +10859,40 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); - auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_M0, DL, - Op->getVTList(), Ops); + auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); + return SDValue(NewMI, 0); + } + case Intrinsic::amdgcn_s_barrier_join: { + // these three intrinsics have one operand: barrier pointer + SDValue Chain = Op->getOperand(0); + SmallVector<SDValue, 2> Ops; + SDValue BarOp = Op->getOperand(2); + unsigned Opc; + + if (isa<ConstantSDNode>(BarOp)) { + uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue(); + Opc = AMDGPU::S_BARRIER_JOIN_IMM; + + // extract the BarrierID from bits 4-9 of the immediate + unsigned BarID = (BarVal >> 4) & 0x3F; + SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); + Ops.push_back(K); + Ops.push_back(Chain); + } else { + Opc = AMDGPU::S_BARRIER_JOIN_M0; + + // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0] + SDValue M0Val; + M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp, + DAG.getShiftAmountConstant(4, MVT::i32, DL)); + M0Val = + SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val, + DAG.getTargetConstant(0x3F, DL, MVT::i32)), + 0); + Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); + } + + auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); return SDValue(NewMI, 0); } case Intrinsic::amdgcn_s_prefetch_data: { @@ -11495,9 +11540,22 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { return FastLowered; SDLoc SL(Op); + EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); + SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS); + SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS); + + if (VT == MVT::bf16) { + SDValue ExtDiv = + DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags()); + return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv, + DAG.getTargetConstant(0, SL, MVT::i32)); + } + + assert(VT == MVT::f16); + // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d @@ -11514,9 +11572,6 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { // We will use ISD::FMA on targets that don't support ISD::FMAD. unsigned FMADOpCode = isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA; - - SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS); - SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS); SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt); SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags()); @@ -15684,7 +15739,7 @@ SDValue SITargetLowering::performFDivCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; SDLoc SL(N); EVT VT = N->getValueType(0); - if (VT != MVT::f16 || !Subtarget->has16BitInsts()) + if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts()) return SDValue(); SDValue LHS = N->getOperand(0); @@ -16849,6 +16904,11 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, const TargetRegisterClass *RC = nullptr; if (Constraint.size() == 1) { + // Check if we cannot determine the bit size of the given value type. 
This + // can happen, for example, in this situation where we have an empty struct + // (size 0): `call void asm "", "v"({} poison)`- + if (VT == MVT::Other) + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); const unsigned BitWidth = VT.getSizeInBits(); switch (Constraint[0]) { default: @@ -16897,13 +16957,26 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, } break; } - // We actually support i128, i16 and f16 as inline parameters - // even if they are not reported as legal - if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 || - VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16)) - return std::pair(0U, RC); + } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) { + const unsigned BitWidth = VT.getSizeInBits(); + switch (BitWidth) { + case 16: + RC = &AMDGPU::AV_32RegClass; + break; + default: + RC = TRI->getVectorSuperClassForBitWidth(BitWidth); + if (!RC) + return std::pair(0U, nullptr); + break; + } } + // We actually support i128, i16 and f16 as inline parameters + // even if they are not reported as legal + if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 || + VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16)) + return std::pair(0U, RC); + auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint); if (Kind != '\0') { if (Kind == 'v') { @@ -16916,7 +16989,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, if (RC) { if (NumRegs > 1) { - if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 > RC->getNumRegs()) + if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs()) return std::pair(0U, nullptr); uint32_t Width = NumRegs * 32; @@ -16988,6 +17061,9 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { case 'a': return C_RegisterClass; } + } else if (Constraint.size() == 2) { + if (Constraint == "VA") + return C_RegisterClass; } if (isImmConstraint(Constraint)) { return C_Other; @@ -17727,23 +17803,9 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) { /// Return if a flat address space atomicrmw can access private memory. 
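
The rewritten body that follows delegates the metadata walk to a shared helper; conceptually it is a range-membership test over the !noalias.addrspace operands, asking whether the private address space (5 on AMDGPU) is among the excluded ranges. A standalone sketch under the assumption that the metadata holds half-open (Lo, Hi) pairs of excluded address spaces:

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// Assumed encoding: each pair excludes address spaces in [Lo, Hi).
static bool hasValueInRanges(
    const std::vector<std::pair<uint32_t, uint32_t>> &Ranges, uint32_t V) {
  for (auto [Lo, Hi] : Ranges)
    if (Lo <= V && V < Hi)
      return true;
  return false;
}

// mayAccessPrivate == "no metadata at all, or private not excluded by it";
// this mirrors `!MD || !hasValueInRangeLikeMetadata(*MD, PRIVATE_ADDRESS)`.
static bool flatMayAccessPrivateSketch(
    const std::vector<std::pair<uint32_t, uint32_t>> *NoAliasAddrSpace) {
  constexpr uint32_t PrivateAS = 5; // AMDGPUAS::PRIVATE_ADDRESS
  return !NoAliasAddrSpace || !hasValueInRanges(*NoAliasAddrSpace, PrivateAS);
}
```

The mayAccessScratchThroughFlat() change further down in this patch applies the same test to the NoAliasAddrSpace AA metadata attached to flat memory operands, so both the ISel-level and MI-level queries share one notion of "provably not private".
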
static bool flatInstrMayAccessPrivate(const Instruction *I) { - const MDNode *NoaliasAddrSpaceMD = - I->getMetadata(LLVMContext::MD_noalias_addrspace); - if (!NoaliasAddrSpaceMD) - return true; - - for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E; - ++I) { - auto *Low = mdconst::extract<ConstantInt>( - NoaliasAddrSpaceMD->getOperand(2 * I + 0)); - if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) { - auto *High = mdconst::extract<ConstantInt>( - NoaliasAddrSpaceMD->getOperand(2 * I + 1)); - return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS); - } - } - - return true; + const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace); + return !MD || + !AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS); } TargetLowering::AtomicExpansionKind diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 4b48fc4..343e455 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2341,6 +2341,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, case AMDGPU::S_MEMREALTIME: case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0: case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: + case AMDGPU::S_BARRIER_LEAVE: case AMDGPU::S_GET_BARRIER_STATE_M0: case AMDGPU::S_GET_BARRIER_STATE_IMM: ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 89d9b0d..50964a9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -473,6 +473,7 @@ class VIMAGE_VSAMPLE_Common <bits<8> op> : Enc96 { let Inst{4} = r128; let Inst{5} = d16; let Inst{6} = a16; + let Inst{7} = cpol{5}; // nv let Inst{21-14} = op; let Inst{25-22} = dmask; let Inst{39-32} = vdata; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 19e6bcf..cc4bee0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2616,9 +2616,9 @@ std::pair<MachineInstr*, MachineInstr*> SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); - if (ST.hasMovB64() && + if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) && AMDGPU::isLegalDPALU_DPPControl( - getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { + ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { MI.setDesc(get(AMDGPU::V_MOV_B64_dpp)); return std::pair(&MI, nullptr); } @@ -2905,7 +2905,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const { - assert(RS && "RegScavenger required for long branching"); assert(MBB.empty() && "new block should be inserted for expanding unconditional branch"); assert(MBB.pred_size() == 1); @@ -4241,6 +4240,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || MI.getOpcode() == AMDGPU::S_SETREG_B32 || MI.getOpcode() == AMDGPU::S_SETPRIO || + MI.getOpcode() == AMDGPU::S_SETPRIO_INC_WG || changesVGPRIndexingMode(MI); } @@ -4267,12 +4267,15 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { if (MI.memoperands_empty()) return true; - // TODO (?): Does this need to be taught how to read noalias.addrspace ? - // See if any memory operand specifies an address space that involves scratch. 
return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { unsigned AS = Memop->getAddrSpace(); - return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; + if (AS == AMDGPUAS::FLAT_ADDRESS) { + const MDNode *MD = Memop->getAAInfo().NoAliasAddrSpace; + return !MD || !AMDGPU::hasValueInRangeLikeMetadata( + *MD, AMDGPUAS::PRIVATE_ADDRESS); + } + return AS == AMDGPUAS::PRIVATE_ADDRESS; }); } @@ -5433,7 +5436,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && - !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) { + !AMDGPU::isLegalDPALU_DPPControl(ST, DC) && + AMDGPU::isDPALU_DPP(Desc, ST)) { ErrInfo = "Invalid dpp_ctrl value: " "DP ALU dpp only support row_newbcast"; return false; @@ -9225,7 +9229,7 @@ bool SIInstrInfo::isHighLatencyDef(int Opc) const { (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc)); } -unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, +Register SIInstrInfo::isStackAccess(const MachineInstr &MI, int &FrameIndex) const { const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); if (!Addr || !Addr->isFI()) @@ -9238,7 +9242,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); } -unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, +Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const { const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); assert(Addr && Addr->isFI()); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 6b9403f..12ffae7 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -996,6 +996,11 @@ public: bool isBarrier(unsigned Opcode) const { return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT || + Opcode == AMDGPU::S_BARRIER_INIT_M0 || + Opcode == AMDGPU::S_BARRIER_INIT_IMM || + Opcode == AMDGPU::S_BARRIER_JOIN_IMM || + Opcode == AMDGPU::S_BARRIER_LEAVE || + Opcode == AMDGPU::S_BARRIER_LEAVE_IMM || Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER; } @@ -1051,7 +1056,7 @@ public: } } - bool isWaitcnt(unsigned Opcode) const { + static bool isWaitcnt(unsigned Opcode) { switch (getNonSoftWaitcntOpcode(Opcode)) { case AMDGPU::S_WAITCNT: case AMDGPU::S_WAITCNT_VSCNT: @@ -1402,8 +1407,8 @@ public: return get(pseudoToMCOpcode(Opcode)); } - unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const; - unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const; + Register isStackAccess(const MachineInstr &MI, int &FrameIndex) const; + Register isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const; Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index c552f1a..c425d97 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1954,6 +1954,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> { !eq(VT, v2f16) : VCSrc_v2f16, !eq(VT, v2bf16) : VCSrc_v2bf16, !eq(VT, f32) : VCSrc_f32, + !eq(VT, f64) : VCSrc_f64, !eq(VT, v2i32) : VCSrc_v2b32, 1 : VCSrc_b32); } @@ -2707,7 +2708,6 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { isModifierType<Src2VT>.ret, HasOMod); field bit HasNeg = HasModifiers; - field bit HasMatrixReuse = 0; field bit HasMatrixFMT = 0; field bit 
HasMatrixScale = 0; field bit HasMatrixReuse = 0; diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index b49c5a9..e204d6b 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -87,6 +87,8 @@ enum InstClassEnum { GLOBAL_STORE_SADDR, FLAT_LOAD, FLAT_STORE, + FLAT_LOAD_SADDR, + FLAT_STORE_SADDR, GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of GLOBAL_STORE // any CombineInfo, they are only ever returned by // getCommonInstClass. @@ -354,6 +356,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORD_SADDR: case AMDGPU::FLAT_LOAD_DWORD: case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_LOAD_DWORD_SADDR: + case AMDGPU::FLAT_STORE_DWORD_SADDR: return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: @@ -367,6 +371,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: case AMDGPU::FLAT_LOAD_DWORDX2: case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX2_SADDR: + case AMDGPU::FLAT_STORE_DWORDX2_SADDR: return 2; case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: @@ -380,6 +386,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: case AMDGPU::FLAT_LOAD_DWORDX3: case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX3_SADDR: + case AMDGPU::FLAT_STORE_DWORDX3_SADDR: return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: @@ -393,6 +401,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: case AMDGPU::FLAT_LOAD_DWORDX4: case AMDGPU::FLAT_STORE_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORDX4_SADDR: + case AMDGPU::FLAT_STORE_DWORDX4_SADDR: return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: @@ -575,6 +585,16 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: return GLOBAL_STORE_SADDR; + case AMDGPU::FLAT_LOAD_DWORD_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX2_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX3_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX4_SADDR: + return FLAT_LOAD_SADDR; + case AMDGPU::FLAT_STORE_DWORD_SADDR: + case AMDGPU::FLAT_STORE_DWORDX2_SADDR: + case AMDGPU::FLAT_STORE_DWORDX3_SADDR: + case AMDGPU::FLAT_STORE_DWORDX4_SADDR: + return FLAT_STORE_SADDR; } } @@ -661,6 +681,16 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: return AMDGPU::GLOBAL_STORE_DWORD_SADDR; + case AMDGPU::FLAT_LOAD_DWORD_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX2_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX3_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX4_SADDR: + return AMDGPU::FLAT_LOAD_DWORD_SADDR; + case AMDGPU::FLAT_STORE_DWORD_SADDR: + case AMDGPU::FLAT_STORE_DWORDX2_SADDR: + case AMDGPU::FLAT_STORE_DWORDX3_SADDR: + case AMDGPU::FLAT_STORE_DWORDX4_SADDR: + return AMDGPU::FLAT_STORE_DWORD_SADDR; } } @@ -776,6 +806,14 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + case 
AMDGPU::FLAT_LOAD_DWORD_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX2_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX3_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX4_SADDR: + case AMDGPU::FLAT_STORE_DWORD_SADDR: + case AMDGPU::FLAT_STORE_DWORDX2_SADDR: + case AMDGPU::FLAT_STORE_DWORDX3_SADDR: + case AMDGPU::FLAT_STORE_DWORDX4_SADDR: Result.SAddr = true; [[fallthrough]]; case AMDGPU::GLOBAL_LOAD_DWORD: @@ -1875,6 +1913,28 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, case 4: return AMDGPU::FLAT_STORE_DWORDX4; } + case FLAT_LOAD_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_LOAD_DWORDX2_SADDR; + case 3: + return AMDGPU::FLAT_LOAD_DWORDX3_SADDR; + case 4: + return AMDGPU::FLAT_LOAD_DWORDX4_SADDR; + } + case FLAT_STORE_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_STORE_DWORDX2_SADDR; + case 3: + return AMDGPU::FLAT_STORE_DWORDX3_SADDR; + case 4: + return AMDGPU::FLAT_STORE_DWORDX4_SADDR; + } case MIMG: assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) && "No overlaps"); @@ -2508,12 +2568,14 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( OptimizeListAgain |= CI.Width + Paired.Width < 4; break; case FLAT_LOAD: + case FLAT_LOAD_SADDR: case GLOBAL_LOAD: case GLOBAL_LOAD_SADDR: NewMI = mergeFlatLoadPair(CI, Paired, Where->I); OptimizeListAgain |= CI.Width + Paired.Width < 4; break; case FLAT_STORE: + case FLAT_STORE_SADDR: case GLOBAL_STORE: case GLOBAL_STORE_SADDR: NewMI = mergeFlatStorePair(CI, Paired, Where->I); diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index f8878f3..e97536d 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -57,6 +57,7 @@ #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -76,10 +77,11 @@ private: LiveIntervals *LIS = nullptr; LiveVariables *LV = nullptr; MachineDominatorTree *MDT = nullptr; + MachinePostDominatorTree *PDT = nullptr; MachineRegisterInfo *MRI = nullptr; SetVector<MachineInstr*> LoweredEndCf; DenseSet<Register> LoweredIf; - SmallSet<MachineBasicBlock *, 4> KillBlocks; + SmallPtrSet<MachineBasicBlock *, 4> KillBlocks; SmallSet<Register, 8> RecomputeRegs; const TargetRegisterClass *BoolRC = nullptr; @@ -138,8 +140,8 @@ private: public: SILowerControlFlow(LiveIntervals *LIS, LiveVariables *LV, - MachineDominatorTree *MDT) - : LIS(LIS), LV(LV), MDT(MDT) {} + MachineDominatorTree *MDT, MachinePostDominatorTree *PDT) + : LIS(LIS), LV(LV), MDT(MDT), PDT(PDT) {} bool run(MachineFunction &MF); }; @@ -159,6 +161,7 @@ public: AU.addUsedIfAvailable<LiveIntervalsWrapperPass>(); // Should preserve the same set that TwoAddressInstructions does. 
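
In the SILowerControlFlow changes below, the pass stops hand-maintaining the dominator tree and instead records the CFG edge changes once, then hands the same batch to both the dominator tree and the newly threaded-through post-dominator tree via applyUpdates(); that is what lets it declare the post-dominator analysis preserved. A trimmed sketch of the block-split case, as a fragment assuming the usual MachineDominators/MachinePostDominators/SmallVector headers:

```cpp
// Describe the edges that changed when SplitBB was split off MBB, then apply
// the identical update list to whichever trees are available.
using DomTreeT = llvm::DomTreeBase<llvm::MachineBasicBlock>;

static void updateTreesAfterSplit(llvm::MachineBasicBlock &MBB,
                                  llvm::MachineBasicBlock &SplitBB,
                                  llvm::MachineDominatorTree *MDT,
                                  llvm::MachinePostDominatorTree *PDT) {
  llvm::SmallVector<DomTreeT::UpdateType, 16> Updates;
  for (llvm::MachineBasicBlock *Succ : SplitBB.successors()) {
    Updates.push_back({DomTreeT::Insert, &SplitBB, Succ}); // edges now leave SplitBB
    Updates.push_back({DomTreeT::Delete, &MBB, Succ});     // ...and no longer MBB
  }
  Updates.push_back({DomTreeT::Insert, &MBB, &SplitBB});   // fall-through edge
  if (MDT)
    MDT->applyUpdates(Updates);
  if (PDT)
    PDT->applyUpdates(Updates);
}
```
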
AU.addPreserved<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachinePostDominatorTreeWrapperPass>(); AU.addPreserved<SlotIndexesWrapperPass>(); AU.addPreserved<LiveIntervalsWrapperPass>(); AU.addPreserved<LiveVariablesWrapperPass>(); @@ -457,7 +460,7 @@ MachineBasicBlock::iterator SILowerControlFlow::skipIgnoreExecInstsTrivialSucc( MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - SmallSet<const MachineBasicBlock *, 4> Visited; + SmallPtrSet<const MachineBasicBlock *, 4> Visited; MachineBasicBlock *B = &MBB; do { if (!Visited.insert(B).second) @@ -506,13 +509,18 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock *SplitBB = &MBB; if (NeedBlockSplit) { SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS); - if (MDT && SplitBB != &MBB) { - MachineDomTreeNode *MBBNode = (*MDT)[&MBB]; - SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(), - MBBNode->end()); - MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB); - for (MachineDomTreeNode *Child : Children) - MDT->changeImmediateDominator(Child, SplitBBNode); + if (SplitBB != &MBB && (MDT || PDT)) { + using DomTreeT = DomTreeBase<MachineBasicBlock>; + SmallVector<DomTreeT::UpdateType, 16> DTUpdates; + for (MachineBasicBlock *Succ : SplitBB->successors()) { + DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ}); + DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ}); + } + DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB}); + if (MDT) + MDT->applyUpdates(DTUpdates); + if (PDT) + PDT->applyUpdates(DTUpdates); } Opcode = OrTermrOpc; InsPt = MI; @@ -727,26 +735,27 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { MachineBasicBlock *Succ = *MBB.succ_begin(); MachineBasicBlock *FallThrough = nullptr; + using DomTreeT = DomTreeBase<MachineBasicBlock>; + SmallVector<DomTreeT::UpdateType, 8> DTUpdates; + while (!MBB.predecessors().empty()) { MachineBasicBlock *P = *MBB.pred_begin(); if (P->getFallThrough(false) == &MBB) FallThrough = P; P->ReplaceUsesOfBlockWith(&MBB, Succ); + DTUpdates.push_back({DomTreeT::Insert, P, Succ}); + DTUpdates.push_back({DomTreeT::Delete, P, &MBB}); } MBB.removeSuccessor(Succ); if (LIS) { for (auto &I : MBB.instrs()) LIS->RemoveMachineInstrFromMaps(I); } - if (MDT) { - // If Succ, the single successor of MBB, is dominated by MBB, MDT needs - // updating by changing Succ's idom to the one of MBB; otherwise, MBB must - // be a leaf node in MDT and could be erased directly. - if (MDT->dominates(&MBB, Succ)) - MDT->changeImmediateDominator(MDT->getNode(Succ), - MDT->getNode(&MBB)->getIDom()); - MDT->eraseNode(&MBB); - } + if (MDT) + MDT->applyUpdates(DTUpdates); + if (PDT) + PDT->applyUpdates(DTUpdates); + MBB.clear(); MBB.eraseFromParent(); if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) { @@ -875,7 +884,11 @@ bool SILowerControlFlowLegacy::runOnMachineFunction(MachineFunction &MF) { LiveVariables *LV = LVWrapper ? &LVWrapper->getLV() : nullptr; auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>(); MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; - return SILowerControlFlow(LIS, LV, MDT).run(MF); + auto *PDTWrapper = + getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>(); + MachinePostDominatorTree *PDT = + PDTWrapper ? 
&PDTWrapper->getPostDomTree() : nullptr; + return SILowerControlFlow(LIS, LV, MDT, PDT).run(MF); } PreservedAnalyses @@ -885,13 +898,16 @@ SILowerControlFlowPass::run(MachineFunction &MF, LiveVariables *LV = MFAM.getCachedResult<LiveVariablesAnalysis>(MF); MachineDominatorTree *MDT = MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF); + MachinePostDominatorTree *PDT = + MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF); - bool Changed = SILowerControlFlow(LIS, LV, MDT).run(MF); + bool Changed = SILowerControlFlow(LIS, LV, MDT, PDT).run(MF); if (!Changed) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); PA.preserve<MachineDominatorTreeAnalysis>(); + PA.preserve<MachinePostDominatorTreeAnalysis>(); PA.preserve<SlotIndexesAnalysis>(); PA.preserve<LiveIntervalsAnalysis>(); PA.preserve<LiveVariablesAnalysis>(); diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 9509199..09b737c 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -209,10 +209,13 @@ void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) { // So set the save points for those. // Use the points found by shrink-wrapping, if any. - if (MFI.getSavePoint()) { - SaveBlocks.push_back(MFI.getSavePoint()); - assert(MFI.getRestorePoint() && "Both restore and save must be set"); - MachineBasicBlock *RestoreBlock = MFI.getRestorePoint(); + if (!MFI.getSavePoints().empty()) { + assert(MFI.getSavePoints().size() == 1 && + "Multiple save points not yet supported!"); + SaveBlocks.push_back(MFI.getSavePoints().front()); + assert(MFI.getRestorePoints().size() == 1 && + "Multiple restore points not yet supported!"); + MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front(); // If RestoreBlock does not have any successor and is not a return block // then the end point is unreachable and we do not need to insert any // epilogue. diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 9a1448f..8a11203 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -33,7 +33,7 @@ using namespace llvm; // optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases // where it is better to produce the VGPR form (e.g. if there are VGPR users // of the MFMA result). -cl::opt<bool> MFMAVGPRForm( +static cl::opt<bool> MFMAVGPRForm( "amdgpu-mfma-vgpr-form", cl::Hidden, cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. 
If " "unspecified, default to compiler heuristics"), @@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()), HasSpilledSGPRs(MFI.hasSpilledSGPRs()), HasSpilledVGPRs(MFI.hasSpilledVGPRs()), + NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()), + NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()), HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()), Occupancy(MFI.getOccupancy()), ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), @@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( WaveLimiter = YamlMFI.WaveLimiter; HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs; HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs; + NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs; + NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs; BytesInStackArgArea = YamlMFI.BytesInStackArgArea; ReturnsVoid = YamlMFI.ReturnsVoid; IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 08b0206..ca8f803 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -270,6 +270,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { bool WaveLimiter = false; bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; + uint16_t NumWaveDispatchSGPRs = 0; + uint16_t NumWaveDispatchVGPRs = 0; uint32_t HighBitsOf32BitAddress = 0; // TODO: 10 may be a better default since it's the maximum. @@ -327,6 +329,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false); YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false); + YamlIO.mapOptional("numWaveDispatchSGPRs", MFI.NumWaveDispatchSGPRs, false); + YamlIO.mapOptional("numWaveDispatchVGPRs", MFI.NumWaveDispatchVGPRs, false); YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg, StringValue("$private_rsrc_reg")); YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg, @@ -465,6 +469,9 @@ private: unsigned NumUserSGPRs = 0; unsigned NumSystemSGPRs = 0; + unsigned NumWaveDispatchSGPRs = 0; + unsigned NumWaveDispatchVGPRs = 0; + bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; bool HasNonSpillStackObjects = false; @@ -991,6 +998,14 @@ public: return UserSGPRInfo.getNumKernargPreloadSGPRs(); } + unsigned getNumWaveDispatchSGPRs() const { return NumWaveDispatchSGPRs; } + + void setNumWaveDispatchSGPRs(unsigned Count) { NumWaveDispatchSGPRs = Count; } + + unsigned getNumWaveDispatchVGPRs() const { return NumWaveDispatchVGPRs; } + + void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; } + Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index 205a45a..38d9a4b 100644 --- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -130,6 +130,9 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) { if (VirtReg.isPhysical()) continue; + if (!VirtReg.isValid()) + continue; + if (!VRM->hasPhys(VirtReg)) continue; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 81655f5..0293d40 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ 
b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1166,7 +1166,8 @@ class RegOrImmOperand <RegisterClass RegClass, string OperandTypeName> } //===----------------------------------------------------------------------===// -// SSrc_* Operands with an SGPR or a 32-bit immediate +// SSrc_* Operands with an SGPR, a 32-bit immediate, or 64-bit immediate +// if supported by target. //===----------------------------------------------------------------------===// class SrcRegOrImm9<RegisterClass regClass, string operandType> diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 431d73b..a003a46 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -484,6 +484,24 @@ def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (o let isConvergent = 1; } +def S_BARRIER_INIT_M0 : SOP1_Pseudo <"s_barrier_init m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_INIT_IMM : SOP1_Pseudo <"s_barrier_init", (outs), + (ins SplitBarrier:$src0), "$src0", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + +def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins), + "", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + } // End Uses = [M0] def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs), @@ -501,6 +519,12 @@ def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (out let isConvergent = 1; } +def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs), + (ins SplitBarrier:$src0), "$src0", []>{ + let SchedRW = [WriteBarrier]; + let isConvergent = 1; +} + } // End has_sdst = 0 def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst), @@ -1588,6 +1612,17 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm let isConvergent = 1; } +def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins)> { + let SchedRW = [WriteBarrier]; + let simm16 = 0; + let fixed_imm = 1; + let isConvergent = 1; + let Defs = [SCC]; +} + +def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave", + (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>; + def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { let SubtargetPredicate = isGFX8Plus; let simm16 = 0; @@ -1630,7 +1665,9 @@ def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; -def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; +def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> { + let SubtargetPredicate = isNotGFX1250Plus; +} // On SI the documentation says sleep for approximately 64 * low 2 // bits, consistent with the reported maximum of 448. 
On VI the @@ -2144,9 +2181,13 @@ defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12<0x04d>; defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12<0x04e>; defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>; defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>; +defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>; +defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>; defm S_BARRIER_SIGNAL_IMM : SOP1_IMM_Real_gfx12<0x04e>; defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12<0x04f>; defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>; +defm S_BARRIER_INIT_IMM : SOP1_IMM_Real_gfx12<0x051>; +defm S_BARRIER_JOIN_IMM : SOP1_IMM_Real_gfx12<0x052>; defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>; defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>; @@ -2639,6 +2680,7 @@ multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> { } defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>; +defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>; defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12<0x040>; defm S_WAIT_STORECNT : SOPP_Real_32_gfx12<0x041>; defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12<0x042>; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 3d9455f..c740b5e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -106,7 +106,7 @@ static constexpr CustomOperand MsgOperands[] = { {{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10}, {{"MSG_HS_TESSFACTOR"}, ID_HS_TESSFACTOR_GFX11Plus, isGFX11Plus}, {{"MSG_DEALLOC_VGPRS"}, ID_DEALLOC_VGPRS_GFX11Plus, isGFX11Plus}, - {{""}}, + {{"MSG_SAVEWAVE_HAS_TDM"}, ID_SAVEWAVE_HAS_TDM, isGFX1250}, {{"MSG_SYSMSG"}, ID_SYSMSG}, {{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus}, {{"MSG_RTN_GET_DDID"}, ID_RTN_GET_DDID, isGFX11Plus}, @@ -195,7 +195,7 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, {{""}}, {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11}, - {{""}}, + {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250}, {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11}, {{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus}, {{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus}, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 1e3e9a2..6e4e087 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -1160,17 +1161,28 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) { return 65536; if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) return 163840; + if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) + return 327680; return 0; } unsigned getEUsPerCU(const MCSubtargetInfo *STI) { // "Per CU" really means "per whatever functional block the waves of a - // workgroup must share". For gfx10 in CU mode this is the CU, which contains + // workgroup must share". + + // GFX12.5 only supports CU mode, which contains four SIMDs. + if (isGFX1250(*STI)) { + assert(STI->getFeatureBits().test(FeatureCuMode)); + return 4; + } + + // For gfx10 in CU mode the functional block is the CU, which contains // two SIMDs. 
if (isGFX10Plus(*STI) && STI->getFeatureBits().test(FeatureCuMode)) return 2; - // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains - // two CUs, so a total of four SIMDs. + + // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP + // contains two CUs, so a total of four SIMDs. return 4; } @@ -1666,6 +1678,29 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size) { return Vals; } +bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) { + assert((MD.getNumOperands() % 2 == 0) && "invalid number of operands!"); + for (unsigned I = 0, E = MD.getNumOperands() / 2; I != E; ++I) { + auto Low = + mdconst::extract<ConstantInt>(MD.getOperand(2 * I + 0))->getValue(); + auto High = + mdconst::extract<ConstantInt>(MD.getOperand(2 * I + 1))->getValue(); + // There are two types of [A; B) ranges: + // A < B, e.g. [4; 5) which is a range that only includes 4. + // A > B, e.g. [5; 4) which is a range that wraps around and includes + // everything except 4. + if (Low.ult(High)) { + if (Low.ule(Val) && High.ugt(Val)) + return true; + } else { + if (Low.uge(Val) && High.ult(Val)) + return true; + } + } + + return false; +} + unsigned getVmcntBitMask(const IsaVersion &Version) { return (1 << (getVmcntBitWidthLo(Version.Major) + getVmcntBitWidthHi(Version.Major))) - @@ -2406,7 +2441,11 @@ unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler) { return 0; } -unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { return 16; } +unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { + if (isGFX1250(STI)) + return 32; + return 16; +} bool isSI(const MCSubtargetInfo &STI) { return STI.hasFeature(AMDGPU::FeatureSouthernIslands); @@ -2478,6 +2517,12 @@ bool isGFX1250(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts]; } +bool supportsWGP(const MCSubtargetInfo &STI) { + if (isGFX1250(STI)) + return false; + return isGFX10Plus(STI); +} + bool isNotGFX11Plus(const MCSubtargetInfo &STI) { return !isGFX11Plus(STI); } bool isNotGFX10Plus(const MCSubtargetInfo &STI) { @@ -3309,13 +3354,39 @@ bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) { return false; } -bool isDPALU_DPP(const MCInstrDesc &OpDesc) { +bool isDPALU_DPP32BitOpc(unsigned Opc) { + switch (Opc) { + case AMDGPU::V_MUL_LO_U32_e64: + case AMDGPU::V_MUL_LO_U32_e64_dpp: + case AMDGPU::V_MUL_LO_U32_e64_dpp_gfx1250: + case AMDGPU::V_MUL_HI_U32_e64: + case AMDGPU::V_MUL_HI_U32_e64_dpp: + case AMDGPU::V_MUL_HI_U32_e64_dpp_gfx1250: + case AMDGPU::V_MUL_HI_I32_e64: + case AMDGPU::V_MUL_HI_I32_e64_dpp: + case AMDGPU::V_MUL_HI_I32_e64_dpp_gfx1250: + case AMDGPU::V_MAD_U32_e64: + case AMDGPU::V_MAD_U32_e64_dpp: + case AMDGPU::V_MAD_U32_e64_dpp_gfx1250: + return true; + default: + return false; + } +} + +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) { + if (!ST.hasFeature(AMDGPU::FeatureDPALU_DPP)) + return false; + + if (isDPALU_DPP32BitOpc(OpDesc.getOpcode())) + return ST.hasFeature(AMDGPU::FeatureGFX1250Insts); + return hasAny64BitVGPROperands(OpDesc); } unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { - // Currently this is 128 for all subtargets - return 128; + return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 
256 + : 128; } bool isPackedFP32Inst(unsigned Opc) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 1bcd36c..70dfb63 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -35,6 +35,7 @@ class MCInstrInfo; class MCRegisterClass; class MCRegisterInfo; class MCSubtargetInfo; +class MDNode; class StringRef; class Triple; class raw_ostream; @@ -1064,6 +1065,9 @@ SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name, std::optional<SmallVector<unsigned>> getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size); +/// Checks if \p Val is inside \p MD, a !range-like metadata. +bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val); + /// Represents the counter values to wait for in an s_waitcnt instruction. /// /// Large values (including the maximum possible integer) can be used to @@ -1549,6 +1553,7 @@ bool isGFX11Plus(const MCSubtargetInfo &STI); bool isGFX12(const MCSubtargetInfo &STI); bool isGFX12Plus(const MCSubtargetInfo &STI); bool isGFX1250(const MCSubtargetInfo &STI); +bool supportsWGP(const MCSubtargetInfo &STI); bool isNotGFX12Plus(const MCSubtargetInfo &STI); bool isNotGFX11Plus(const MCSubtargetInfo &STI); bool isGCN3Encoding(const MCSubtargetInfo &STI); @@ -1750,15 +1755,22 @@ unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST); bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); LLVM_READNONE -inline bool isLegalDPALU_DPPControl(unsigned DC) { - return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST; +inline bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC) { + if (isGFX12(ST)) + return DC >= DPP::ROW_SHARE_FIRST && DC <= DPP::ROW_SHARE_LAST; + if (isGFX90A(ST)) + return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST; + return false; } /// \returns true if an instruction may have a 64-bit VGPR operand. bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc); +/// \returns true if an instruction is a DP ALU DPP without any 64-bit operands. +bool isDPALU_DPP32BitOpc(unsigned Opc); + /// \returns true if an instruction is a DP ALU DPP. -bool isDPALU_DPP(const MCInstrDesc &OpDesc); +bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST); /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index b128207..11c7275 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -706,7 +706,6 @@ def V_CVT_F16_F8_Fake16_Profile : VOP3_Profile_Fake16<V_CVT_F16_F8_Profile>; let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in { - // FIXME: This differs from downstream due to changes that haven't been upstreamed yet. let SubtargetPredicate = isGFX12PlusNot12_50 in defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>; let SubtargetPredicate = isGFX125xOnly in @@ -731,7 +730,6 @@ class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst, bit HasOpSe >; let OtherPredicates = [HasFP8ConversionInsts] in { - // FIXME: This differs from downstream due to changes that haven't been upstreamed yet. 
let SubtargetPredicate = isGFX12PlusNot12_50 in def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>; let SubtargetPredicate = isGFX125xOnly in { @@ -740,7 +738,6 @@ let OtherPredicates = [HasFP8ConversionInsts] in { def : GCNPat<(int_amdgcn_cvt_f32_fp8_e5m3 i32:$src0, timm:$byte_sel), (V_CVT_F32_FP8_gfx1250_e64 $src0, DSTCLAMP.ENABLE, (as_i32timm $byte_sel))>; } - // FIXME: This differs from downstream due to changes that haven't been upstreamed yet. let SubtargetPredicate = isGFX12Plus in def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>; } @@ -1058,11 +1055,6 @@ multiclass VOP1Only_Real_gfx11_gfx12<bits<9> op> : multiclass VOP1_Real_FULL_gfx11_gfx12<bits<9> op> : VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>; -multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op, - string opName, string asmName> : - VOP1_Real_e32_with_name<Gen, op, opName, asmName>, - VOP3_Real_with_name<Gen, {0, 1, 1, op{6-0}}, opName, asmName>; - multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250< bits<9> op, string asmName = !tolower(NAME), string opName = NAME> { defm opName#"_t16" : diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index f4b6af6..329d003 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -2084,6 +2084,9 @@ multiclass VOP3_Realtriple_gfx11_gfx12<bits<10> op> : multiclass VOP3_Real_Base_gfx11_gfx12<bits<10> op> : VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Gen, op>; +multiclass VOP3_Real_Base_gfx11_gfx12_not_gfx1250<bits<10> op> : + VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Not12_50Gen, op>; + multiclass VOP3_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName, string asmName> : VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName>, @@ -2211,9 +2214,9 @@ defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>; defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>; defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>; defm V_LDEXP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32b>; -defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12<0x32c>; -defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12<0x32d>; -defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12<0x32e>; +defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32c>; +defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32d>; +defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32e>; defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32f>; defm V_LSHLREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x338, "v_lshlrev_b16">; defm V_LSHRREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x339, "v_lshrrev_b16">; @@ -2242,6 +2245,10 @@ let AssemblerPredicate = isGFX11Plus in { } // These instructions differ from GFX12 variant by supporting DPP: +defm V_MUL_LO_U32 : VOP3Only_Realtriple_gfx1250<0x32c>; +defm V_MUL_HI_U32 : VOP3Only_Realtriple_gfx1250<0x32d>; +defm V_MUL_HI_I32 : VOP3Only_Realtriple_gfx1250<0x32e>; + defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>; defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>; defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>; |
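Note on the SILowerControlFlow hunks above: instead of hand-editing immediate dominators when a block is split or a redundant block is removed, the pass now records each CFG change as a batch of edge insertions and deletions and hands the same batch to both MachineDominatorTree and MachinePostDominatorTree via applyUpdates. A minimal sketch of that pattern follows; updateDomTreesAfterSplit is a hypothetical helper name and the surrounding pass boilerplate is omitted.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/Support/GenericDomTree.h"

using namespace llvm;

// Hypothetical helper: OldBB has just been split and SplitBB now owns OldBB's
// former successors. Describe the edge changes once and give the same batch to
// whichever trees are available; either pointer may be null.
static void updateDomTreesAfterSplit(MachineBasicBlock &OldBB,
                                     MachineBasicBlock &SplitBB,
                                     MachineDominatorTree *MDT,
                                     MachinePostDominatorTree *PDT) {
  using DomTreeT = DomTreeBase<MachineBasicBlock>;
  SmallVector<DomTreeT::UpdateType, 16> Updates;
  for (MachineBasicBlock *Succ : SplitBB.successors()) {
    Updates.push_back({DomTreeT::Insert, &SplitBB, Succ});
    Updates.push_back({DomTreeT::Delete, &OldBB, Succ});
  }
  Updates.push_back({DomTreeT::Insert, &OldBB, &SplitBB});
  if (MDT)
    MDT->applyUpdates(Updates);
  if (PDT)
    PDT->applyUpdates(Updates);
}

The same batching is what lets removeMBBifRedundant drop the old special-case reasoning about whether MBB dominates its single successor: the Insert/Delete pairs give the trees enough information to recompute the affected subtrees themselves.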
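SILowerSGPRSpills now reads shrink-wrapping results through the plural getSavePoints()/getRestorePoints() interface while still asserting a single point of each kind. A sketch of that access pattern, assuming those accessors return a container of MachineBasicBlock pointers as the hunk above uses them; the helper and variable names here are illustrative only.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include <cassert>

using namespace llvm;

// Hypothetical: collect the (currently unique) save and restore blocks chosen
// by shrink-wrapping; leave the vectors empty when no points were recorded.
static void collectSaveRestore(MachineFrameInfo &MFI,
                               SmallVectorImpl<MachineBasicBlock *> &Saves,
                               SmallVectorImpl<MachineBasicBlock *> &Restores) {
  if (MFI.getSavePoints().empty())
    return;
  assert(MFI.getSavePoints().size() == 1 &&
         "multiple save points not yet supported");
  assert(MFI.getRestorePoints().size() == 1 &&
         "multiple restore points not yet supported");
  Saves.push_back(MFI.getSavePoints().front());
  Restores.push_back(MFI.getRestorePoints().front());
}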
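The new hasValueInRangeLikeMetadata helper in AMDGPUBaseInfo treats a metadata node as a flat list of [Lo, Hi) pairs, the same shape as !range metadata. The sketch below builds such a node and queries it; it assumes the helper is exposed as llvm::AMDGPU::hasValueInRangeLikeMetadata, that the relative include path matches the in-tree layout, and the i32 element type is purely illustrative.

#include "Utils/AMDGPUBaseInfo.h" // assumed include path within the AMDGPU target
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Build a !range-like node holding the pairs [Lo0, Hi0), [Lo1, Hi1), ...
static MDNode *makeRangeLikeMD(LLVMContext &Ctx,
                               ArrayRef<std::pair<uint64_t, uint64_t>> Pairs) {
  Type *I32 = Type::getInt32Ty(Ctx);
  SmallVector<Metadata *, 8> Ops;
  for (const auto &[Lo, Hi] : Pairs) {
    Ops.push_back(ConstantAsMetadata::get(ConstantInt::get(I32, Lo)));
    Ops.push_back(ConstantAsMetadata::get(ConstantInt::get(I32, Hi)));
  }
  return MDNode::get(Ctx, Ops);
}

// A non-wrapping pair such as [4, 5) contains exactly the value 4.
static bool containsFour(LLVMContext &Ctx) {
  MDNode *MD = makeRangeLikeMD(Ctx, {{4, 5}});
  return AMDGPU::hasValueInRangeLikeMetadata(*MD, 4); // expected: true
}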
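The DP-ALU DPP helpers now take the subtarget: isDPALU_DPP(Desc, ST) first checks FeatureDPALU_DPP and only accepts the listed 32-bit multiply/mad opcodes on GFX1250, while isLegalDPALU_DPPControl(ST, DC) selects the row_share (GFX12) or row_newbcast (GFX90A) dpp_ctrl window. Below is a hedged sketch of how a caller might combine the two new signatures; the wrapper name is illustrative, not an existing API.

#include "Utils/AMDGPUBaseInfo.h" // assumed include path within the AMDGPU target
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSubtargetInfo.h"

using namespace llvm;

// Illustrative combined check: an instruction/dpp_ctrl pairing is acceptable
// if the instruction is not a DP-ALU DPP at all, or if the control value falls
// in the window this subtarget permits for DP-ALU DPP.
static bool dppCtrlOkForDPALU(const MCInstrDesc &Desc,
                              const MCSubtargetInfo &ST, unsigned DppCtrl) {
  if (!AMDGPU::isDPALU_DPP(Desc, ST))
    return true;
  return AMDGPU::isLegalDPALU_DPPControl(ST, DppCtrl);
}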
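getLdsDwGranularity stops being a blanket 128 and reports 256 dwords on subtargets with the 327680-byte addressable LDS, matching the larger getAddressableLocalMemorySize result added above. A small hypothetical helper built on it, assuming the granularity is expressed in dwords of 4 bytes each (the name suggests this, but the hunk above does not spell it out).

#include "Utils/AMDGPUBaseInfo.h" // assumed include path within the AMDGPU target
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

using namespace llvm;

// Hypothetical: round an LDS byte size up to the subtarget's allocation
// granule, converting the dword granularity to bytes first.
static uint64_t roundUpLdsBytes(const MCSubtargetInfo &ST, uint64_t Bytes) {
  const uint64_t GranuleBytes = uint64_t(AMDGPU::getLdsDwGranularity(ST)) * 4;
  return alignTo(Bytes, GranuleBytes);
}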