diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.h | 6 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 22 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 78 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 238 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/MIMGInstructions.td | 185 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 37 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 6 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SOPInstructions.td | 7 |
12 files changed, 400 insertions, 192 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index ce2b4a5..cd8b249 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -562,9 +562,13 @@ public: void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); extern char &AMDGPURewriteAGPRCopyMFMALegacyID; +void initializeAMDGPUUniformIntrinsicCombineLegacyPass(PassRegistry &); +extern char &AMDGPUUniformIntrinsicCombineLegacyPassID; +FunctionPass *createAMDGPUUniformIntrinsicCombineLegacyPass(); + struct AMDGPUUniformIntrinsicCombinePass : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 0eb00cb..529da8d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -50,6 +50,7 @@ const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr); struct ImageDimIntrinsicInfo { unsigned Intr; unsigned BaseOpcode; + unsigned AtomicNoRetBaseOpcode; MIMGDim Dim; uint8_t NumOffsetArgs; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 97c2c9c..9ce1224 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2006,19 +2006,27 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); + unsigned IntrOpcode = Intr->BaseOpcode; + + // For image atomic: use no-return opcode if result is unused. + if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) { + Register ResultDef = MI.getOperand(0).getReg(); + if (MRI->use_nodbg_empty(ResultDef)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; + } const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; - Register VDataIn, VDataOut; + Register VDataIn = AMDGPU::NoRegister; + Register VDataOut = AMDGPU::NoRegister; LLT VDataTy; int NumVDataDwords = -1; bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || @@ -2049,7 +2057,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( unsigned DMaskLanes = 0; if (BaseOpcode->Atomic) { - VDataOut = MI.getOperand(0).getReg(); + if (!BaseOpcode->NoReturn) + VDataOut = MI.getOperand(0).getReg(); VDataIn = MI.getOperand(2).getReg(); LLT Ty = MRI->getType(VDataIn); @@ -2099,8 +2108,9 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. + if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index a6074ea..bf6f1a9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -30,7 +30,6 @@ MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass( MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass()) MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this)) MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this)) -MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef MODULE_PASS #ifndef MODULE_PASS_WITH_PARAMS @@ -69,6 +68,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes", AMDGPUUnifyDivergentExitNodesPass()) FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass()) FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this))) +FUNCTION_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass()) #undef FUNCTION_PASS #ifndef FUNCTION_ANALYSIS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6214f4d..75a94ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -619,6 +619,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR); initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR); initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR); + initializeAMDGPUUniformIntrinsicCombineLegacyPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -887,9 +888,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); - - if (EnableUniformIntrinsicCombine) - PM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerPeepholeEPCallback( @@ -900,6 +898,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FPM.addPass(AMDGPUUseNativeCallsPass()); if (EnableLibCallSimplify) FPM.addPass(AMDGPUSimplifyLibCallsPass()); + + if (EnableUniformIntrinsicCombine) + FPM.addPass(AMDGPUUniformIntrinsicCombinePass()); }); PB.registerCGSCCOptimizerLateEPCallback( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 50c78d8..65e6ed9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -16,12 +16,6 @@ /// uniformity. And every instruction that's downstream and cares about dynamic /// uniformity must be convergent (and isel will introduce v_readfirstlane for /// them if their operands can't be proven statically uniform). -/// -/// This pass is implemented as a ModulePass because intrinsic declarations -/// exist at the module scope, allowing us to skip processing entirely if no -/// declarations are present and to traverse their user lists directly when -/// they are. A FunctionPass would instead require scanning every instruction -/// in every function to find relevant intrinsics, which is far less efficient. //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -97,14 +91,12 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, Tracker[NotOp] = true; // NOT preserves uniformity LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n'); ICmp->replaceAllUsesWith(NotOp); - ICmp->eraseFromParent(); Changed = true; } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) { // Case: (icmp ne %ballot, 0) -> %ballot_arg LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: " << *Src << '\n'); ICmp->replaceAllUsesWith(Src); - ICmp->eraseFromParent(); Changed = true; } } @@ -120,15 +112,17 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, return false; } -/// Iterates over intrinsic declarations in the module to optimize their uses. -static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { +/// Iterates over intrinsic calls in the Function to optimize. +static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) { bool IsChanged = false; ValueMap<const Value *, bool> Tracker; - FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - for (Function &F : M) { - switch (F.getIntrinsicID()) { + for (Instruction &I : make_early_inc_range(instructions(F))) { + auto *II = dyn_cast<IntrinsicInst>(&I); + if (!II) + continue; + + switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_permlane64: case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: @@ -137,23 +131,61 @@ static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) { default: continue; } - - for (User *U : make_early_inc_range(F.users())) { - auto *II = cast<IntrinsicInst>(U); - Function *ParentF = II->getFunction(); - const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF); - IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); - } + IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); } return IsChanged; } PreservedAnalyses -AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) { - if (!runUniformIntrinsicCombine(M, AM)) +AMDGPUUniformIntrinsicCombinePass::run(Function &F, + FunctionAnalysisManager &AM) { + const auto &UI = AM.getResult<UniformityInfoAnalysis>(F); + if (!runUniformIntrinsicCombine(F, UI)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve<UniformityInfoAnalysis>(); return PA; } + +namespace { +class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass { +public: + static char ID; + AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) { + initializeAMDGPUUniformIntrinsicCombineLegacyPass( + *PassRegistry::getPassRegistry()); + } + +private: + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<UniformityInfoWrapperPass>(); + AU.addRequired<TargetPassConfig>(); + } +}; +} // namespace + +char AMDGPUUniformIntrinsicCombineLegacy::ID = 0; +char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID = + AMDGPUUniformIntrinsicCombineLegacy::ID; + +bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + const UniformityInfo &UI = + getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); + return runUniformIntrinsicCombine(F, UI); +} + +INITIALIZE_PASS_BEGIN(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPUUniformIntrinsicCombineLegacy, DEBUG_TYPE, + "AMDGPU Uniform Intrinsic Combine", false, false) + +FunctionPass *llvm::createAMDGPUUniformIntrinsicCombineLegacyPass() { + return new AMDGPUUniformIntrinsicCombineLegacy(); +} diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 5580e4c..09338c5 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -9028,6 +9028,9 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0); + // Parse a dummy operand as a placeholder for the SWZ operand. This enforces + // agreement between MCInstrDesc.getNumOperands and MCInst.getNumOperands. + Inst.addOperand(MCOperand::createImm(0)); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index a911e7e..52cc4ca 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -3267,29 +3267,103 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { return false; assert(!ST.hasExtendedWaitCounts()); - if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI)) + if (!ST.isWave64()) + return false; + + const bool IsSALU = SIInstrInfo::isSALU(*MI); + const bool IsVALU = SIInstrInfo::isVALU(*MI); + if (!IsSALU && !IsVALU) return false; // The hazard sequence is three instructions: // 1. VALU reads SGPR as mask - // 2. SALU writes SGPR - // 3. SALU reads SGPR - // The hazard can expire if the distance between 2 and 3 is sufficient. - // In practice this happens <10% of the time, hence this always assumes - // the hazard exists if 1 and 2 are present to avoid searching. + // 2. VALU/SALU writes SGPR + // 3. VALU/SALU reads SGPR + // The hazard can expire if the distance between 2 and 3 is sufficient, + // or (2) is VALU and (3) is SALU. + // In practice this happens <10% of the time, hence always assume the hazard + // exists if (1) and (2) are present to avoid searching all SGPR reads. - const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst); - if (!SDSTOp || !SDSTOp->isReg()) - return false; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + auto IgnoreableSGPR = [](const Register Reg) { + switch (Reg) { + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::M0: + case AMDGPU::SGPR_NULL: + case AMDGPU::SGPR_NULL64: + case AMDGPU::SCC: + return true; + default: + return false; + } + }; + auto IsVCC = [](const Register Reg) { + return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI; + }; + + struct StateType { + SmallSet<Register, 2> HazardSGPRs; + + static unsigned getHashValue(const StateType &State) { + return hash_combine_range(State.HazardSGPRs); + } + static bool isEqual(const StateType &LHS, const StateType &RHS) { + return LHS.HazardSGPRs == RHS.HazardSGPRs; + } + }; + + SmallVector<const MachineInstr *> WaitInstrs; + bool HasSGPRRead = false; + StateType InitialState; + + // Look for SGPR write. + MachineOperand *HazardDef = nullptr; + for (MachineOperand &Op : MI->operands()) { + if (!Op.isReg()) + continue; + if (Op.isDef() && HazardDef) + continue; + + Register Reg = Op.getReg(); + if (IgnoreableSGPR(Reg)) + continue; + if (!IsVCC(Reg)) { + if (Op.isImplicit()) + continue; + if (!TRI->isSGPRReg(MRI, Reg)) + continue; + } + // Also check for SGPR reads. + if (Op.isUse()) { + HasSGPRRead = true; + continue; + } + + assert(!HazardDef); + HazardDef = &Op; + } - const Register HazardReg = SDSTOp->getReg(); - if (HazardReg == AMDGPU::EXEC || - HazardReg == AMDGPU::EXEC_LO || - HazardReg == AMDGPU::EXEC_HI || - HazardReg == AMDGPU::M0) + if (!HazardDef) return false; - auto IsHazardFn = [HazardReg, this](const MachineInstr &I) { + // Setup to track writes to individual SGPRs + const Register HazardReg = HazardDef->getReg(); + if (AMDGPU::SReg_32RegClass.contains(HazardReg)) { + InitialState.HazardSGPRs.insert(HazardReg); + } else { + assert(AMDGPU::SReg_64RegClass.contains(HazardReg)); + InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0)); + InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1)); + } + + auto IsHazardFn = [&](StateType &State, const MachineInstr &I) { + if (State.HazardSGPRs.empty()) + return HazardExpired; + switch (I.getOpcode()) { case AMDGPU::V_ADDC_U32_e32: case AMDGPU::V_ADDC_U32_dpp: @@ -3304,11 +3378,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { case AMDGPU::V_SUBB_U32_e32: case AMDGPU::V_SUBB_U32_dpp: case AMDGPU::V_SUBBREV_U32_e32: - case AMDGPU::V_SUBBREV_U32_dpp: + case AMDGPU::V_SUBBREV_U32_dpp: { // These implicitly read VCC as mask source. - return HazardReg == AMDGPU::VCC || - HazardReg == AMDGPU::VCC_LO || - HazardReg == AMDGPU::VCC_HI; + return IsVCC(HazardReg) ? HazardFound : NoHazardFound; + } case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_ADDC_U32_e64_dpp: case AMDGPU::V_CNDMASK_B16_t16_e64: @@ -3324,68 +3397,109 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { // Only check mask register overlaps. const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2); assert(SSRCOp); - return TRI.regsOverlap(SSRCOp->getReg(), HazardReg); + bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg); + return Result ? HazardFound : NoHazardFound; } default: - return false; + return NoHazardFound; } }; - const MachineRegisterInfo &MRI = MF.getRegInfo(); - auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { - // s_waitcnt_depctr sa_sdst(0) mitigates hazard. - if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) - return true; - - // VALU access to any SGPR or literal constant other than HazardReg - // mitigates hazard. No need to check HazardReg here as this will - // only be called when !IsHazardFn. - if (!SIInstrInfo::isVALU(I)) - return false; - for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) { - const MachineOperand &Op = I.getOperand(OpNo); - if (Op.isReg()) { - Register OpReg = Op.getReg(); - // Only consider uses - if (!Op.isUse()) + const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst( + AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0), + 0); + auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) { + switch (I.getOpcode()) { + case AMDGPU::S_WAITCNT_DEPCTR: + // Record mergable waits within region of instructions free of SGPR reads. + if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() && + (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits) + WaitInstrs.push_back(&I); + break; + default: + // Update tracking of SGPR reads and writes. + for (auto &Op : I.operands()) { + if (!Op.isReg()) continue; - // Ignore EXEC - if (OpReg == AMDGPU::EXEC || - OpReg == AMDGPU::EXEC_LO || - OpReg == AMDGPU::EXEC_HI) + + Register Reg = Op.getReg(); + if (IgnoreableSGPR(Reg)) continue; - // Ignore all implicit uses except VCC - if (Op.isImplicit()) { - if (OpReg == AMDGPU::VCC || - OpReg == AMDGPU::VCC_LO || - OpReg == AMDGPU::VCC_HI) - return true; + if (!IsVCC(Reg)) { + if (Op.isImplicit()) + continue; + if (!TRI->isSGPRReg(MRI, Reg)) + continue; + } + if (Op.isUse()) { + HasSGPRRead = true; continue; } - if (TRI.isSGPRReg(MRI, OpReg)) - return true; - } else { - const MCInstrDesc &InstDesc = I.getDesc(); - const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; - if (!TII.isInlineConstant(Op, OpInfo)) - return true; + + // Stop tracking any SGPRs with writes on the basis that they will + // already have an appropriate wait inserted afterwards. + SmallVector<Register, 2> Found; + for (Register SGPR : State.HazardSGPRs) { + if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR)) + Found.push_back(SGPR); + } + for (Register SGPR : Found) + State.HazardSGPRs.erase(SGPR); } + break; } - return false; }; // Check for hazard - if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == - std::numeric_limits<int>::max()) + if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn, + MI->getParent(), + std::next(MI->getReverseIterator()))) return false; - auto NextMI = std::next(MI->getIterator()); + // Compute counter mask + unsigned DepCtr = + IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0) + : AMDGPU::DepCtr::encodeFieldVaSdst(0)) + : AMDGPU::DepCtr::encodeFieldSaSdst(0); + + // Try to merge previous waits into this one for regions with no SGPR reads. + if (!WaitInstrs.empty()) { + // Note: WaitInstrs contains const pointers, so walk backward from MI to + // obtain a mutable pointer to each instruction to be merged. + // This is expected to be a very short walk within the same block. + SmallVector<MachineInstr *> ToErase; + unsigned Found = 0; + for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(), + End = MI->getParent()->rend(); + Found < WaitInstrs.size() && It != End; ++It) { + MachineInstr *WaitMI = &*It; + // Find next wait instruction. + if (std::as_const(WaitMI) != WaitInstrs[Found]) + continue; + Found++; + unsigned WaitMask = WaitMI->getOperand(0).getImm(); + assert((WaitMask & ConstantMaskBits) == ConstantMaskBits); + DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask), + AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr))); + DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask), + AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr))); + DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc( + DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask), + AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr))); + ToErase.push_back(WaitMI); + } + assert(Found == WaitInstrs.size()); + for (MachineInstr *WaitMI : ToErase) + WaitMI->eraseFromParent(); + } - // Add s_waitcnt_depctr sa_sdst(0) after SALU write. + // Add s_waitcnt_depctr after SGPR write. + auto NextMI = std::next(MI->getIterator()); auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + .addImm(DepCtr); // SALU write may be s_getpc in a bundle. updateGetPCBundle(NewMI); diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 5f6d742..d950131 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -877,69 +877,69 @@ multiclass MIMG_Store <mimgopc op, string asm, bit has_d16, bit mip = 0> { } class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx6789 <op, !if(noRtn, (outs), (outs data_rc:$vdst)), dns> { + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$tfe$lwe$da"; } class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, string dns=""> - : MIMG_gfx90a <op, (outs getAlign2RegOp<data_rc>.ret:$vdst), dns> { - let Constraints = "$vdst = $vdata"; - + RegisterClass addr_rc, bit noRtn, string dns=""> + : MIMG_gfx90a <op, !if(noRtn, (outs), (outs getAlign2RegOp<data_rc>.ret:$vdst)), dns> { + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins getAlign2RegOp<data_rc>.ret:$vdata, addr_rc:$vaddr, SReg_256_XNULL:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, R128A16:$r128, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$cpol$r128$lwe$da"; } class MIMG_Atomic_si<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.SI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX6GFX7", "")> { let AssemblerPredicate = isGFX6GFX7; } class MIMG_Atomic_vi<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX8", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx6789_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX8", "")> { let AssemblerPredicate = isGFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx8; } class MIMG_Atomic_gfx90a<mimgopc op, string asm, RegisterOperand data_rc, - RegisterClass addr_rc, bit enableDasm = 0> - : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, !if(enableDasm, "GFX90A", "")> { + RegisterClass addr_rc, bit noRtn = 0, bit enableDasm = 0> + : MIMG_Atomic_gfx90a_base<op.VI, asm, data_rc, addr_rc, noRtn, !if(enableDasm, "GFX90A", "")> { let AssemblerPredicate = isGFX90APlus; let MIMGEncoding = MIMGEncGfx90a; } class MIMG_Atomic_gfx10<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx10<op.GFX10M, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX10", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx10<op.GFX10M, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx10<op.GFX10M, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX10", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -950,24 +950,24 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, class MIMG_Atomic_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, RegisterClass AddrRC, - bit enableDisasm = 0> - : MIMG_gfx11<op.GFX11, (outs DataRC:$vdst), + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256_XNULL:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe, LWE:$lwe); - let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"; } class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, RegisterOperand DataRC, int num_addrs, - bit enableDisasm = 0> - : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdst), num_addrs, + bit noRtn = 0, bit enableDisasm = 0> + : MIMG_nsa_gfx11<op.GFX11, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX11", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$srsrc, DMask:$dmask, @@ -977,11 +977,11 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, } class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterOperand DataRC, - int num_addrs, string renamed, bit enableDisasm = 0> - : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdst), num_addrs, + int num_addrs, string renamed, bit noRtn = 0, bit enableDisasm = 0> + : VIMAGE_gfx12<op.GFX12, !if(noRtn, (outs), (outs DataRC:$vdst)), num_addrs, !if(enableDisasm, "GFX12", "")> { - let Constraints = "$vdst = $vdata"; - + let Constraints = !if(noRtn, "", "$vdst = $vdata"); + let isCodeGenOnly = noRtn; let InOperandList = !con((ins DataRC:$vdata), AddrIns, (ins SReg_256_XNULL:$rsrc, DMask:$dmask, Dim:$dim, @@ -994,95 +994,96 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, RegisterOperand data_rc, bit enableDasm = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { let hasSideEffects = 1, // FIXME: remove this mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1, - FPAtomic = isFP in { + FPAtomic = isFP, IsAtomicNoRet = noRtn in { let VAddrDwords = 1 in { let ssamp = 0 in { if op.HAS_SI then { - def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_VI then { - def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; let hasPostISelHook = 1 in - def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX10M then { - def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>; + def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed>; + def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, renamed, noRtn>; } } let VAddrDwords = 2 in { let ssamp = 0 in { if op.HAS_SI then { - def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>; + def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, noRtn, 0>; } if op.HAS_VI then { - def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>; - def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, 0>; + def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>; + def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, noRtn, 0>; } if op.HAS_GFX11 then { - def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>; - def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>; + def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, noRtn, 0>; + def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed>; + def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, renamed, noRtn>; } } let VAddrDwords = 3 in { let ssamp = 0 in { if op.HAS_SI then { - def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>; + def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, noRtn, 0>; } if op.HAS_VI then { - def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>; - def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, 0>; + def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>; + def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, noRtn, 0>; } if op.HAS_GFX11 then { - def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>; - def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>; + def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, noRtn, 0>; + def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, noRtn, 0>; } } if op.HAS_GFX12 then { - def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed>; + def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, renamed, noRtn>; } } let VAddrDwords = 4 in { let ssamp = 0 in { if op.HAS_SI then { - def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>; + def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, noRtn, 0>; } if op.HAS_VI then { - def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>; - def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, 0>; + def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128_Align2, noRtn, 0>; } if op.HAS_GFX10M then { - def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>; + def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, noRtn, enableDasm>; } if op.HAS_GFX11 then { - def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>; - def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>; + def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, noRtn, 0>; + def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, noRtn, enableDasm>; } } if op.HAS_GFX12 then { - def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, enableDasm>; + def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, renamed, noRtn, enableDasm>; } } } @@ -1095,12 +1096,13 @@ multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm, } } -multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, - string renamed = ""> { // 64-bit atomics - let IsAtomicRet = 1 in { +multiclass MIMG_Atomic_Base <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, + bit noRtn = 0, string renamed = ""> { // 64-bit atomics + let IsAtomicRet = !not(noRtn) in { def "" : MIMGBaseOpcode { let Atomic = 1; let AtomicX2 = isCmpSwap; + let NoReturn = noRtn; } let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { @@ -1109,22 +1111,28 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, // Other variants are reconstructed by disassembler using dmask and tfe. if !not(isCmpSwap) then { let VDataDwords = 1 in - defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, renamed>; + defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_32, 1, isFP, noRtn, renamed>; } let VDataDwords = 2 in - defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, renamed>; + defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_64, isCmpSwap, isFP, noRtn, renamed>; let VDataDwords = 3 in - defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, renamed>; + defm _V3 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_96, 0, isFP, noRtn, renamed>; if isCmpSwap then { let VDataDwords = 4 in - defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, renamed>; + defm _V4 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_128, 0, isFP, noRtn, renamed>; let VDataDwords = 5 in - defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, renamed>; + defm _V5 : MIMG_Atomic_Addr_Helper_m <op, asm, AVLdSt_160, 0, isFP, noRtn, renamed>; } } - } // End IsAtomicRet = 1 + } +} + +multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0, + string renamed = ""> { + defm "" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/0, renamed>; + defm "_NORTN" : MIMG_Atomic_Base <op, asm, isCmpSwap, isFP, /*noRtn=*/1, renamed>; } multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed, @@ -1820,6 +1828,7 @@ let SubtargetPredicate = isGFX12Plus in { class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { Intrinsic Intr = I; MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod)); + MIMGBaseOpcode AtomicNoRetBaseOpcode = BaseOpcode; AMDGPUDimProps Dim = I.P.Dim; AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>; @@ -1855,13 +1864,20 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0)); } +class ImageDimAtomicIntrinsicInfo<AMDGPUImageDimIntrinsic I> + : ImageDimIntrinsicInfo<I> { + MIMGBaseOpcode AtomicNoRetBaseOpcode = + !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod, "_NORTN")); +} + def ImageDimIntrinsicTable : GenericTable { let FilterClass = "ImageDimIntrinsicInfo"; - let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs", - "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd", - "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", + let Fields = ["Intr", "BaseOpcode", "AtomicNoRetBaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", + "NumVAddrs", "NumArgs", "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", + "VAddrEnd", "RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex", "BiasTyArg", "GradientTyArg", "CoordTyArg"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; + string TypeOf_AtomicNoRetBaseOpcode = "MIMGBaseOpcode"; string TypeOf_Dim = "MIMGDim"; let PrimaryKey = ["Intr"]; @@ -1874,11 +1890,14 @@ def getImageDimIntrinsicByBaseOpcode : SearchIndex { let Key = ["BaseOpcode", "Dim"]; } -foreach intr = !listconcat(AMDGPUImageDimIntrinsics, - AMDGPUImageDimAtomicIntrinsics) in { +foreach intr = AMDGPUImageDimIntrinsics in { def : ImageDimIntrinsicInfo<intr>; } +foreach intr = AMDGPUImageDimAtomicIntrinsics in { + def : ImageDimAtomicIntrinsicInfo<intr>; +} + // L to LZ Optimization Mapping def : MIMGLZMapping<IMAGE_SAMPLE_L, IMAGE_SAMPLE_LZ>; def : MIMGLZMapping<IMAGE_SAMPLE_C_L, IMAGE_SAMPLE_C_LZ>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index be42291..b34ab2a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9134,16 +9134,23 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDLoc DL(Op); MachineFunction &MF = DAG.getMachineFunction(); const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>(); + unsigned IntrOpcode = Intr->BaseOpcode; + // For image atomic: use no-return opcode if result is unused. + if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode && + !Op.getNode()->hasAnyUseOfValue(0)) + IntrOpcode = Intr->AtomicNoRetBaseOpcode; const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); - unsigned IntrOpcode = Intr->BaseOpcode; bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); SmallVector<EVT, 3> ResultTypes(Op->values()); SmallVector<EVT, 3> OrigResultTypes(Op->values()); + if (BaseOpcode->NoReturn && BaseOpcode->Atomic) + ResultTypes.erase(&ResultTypes[0]); + bool IsD16 = false; bool IsG16 = false; bool IsA16 = false; @@ -9162,8 +9169,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, VData = Op.getOperand(2); IsAtomicPacked16Bit = - (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || - Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); + (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 || + IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN); bool Is64Bit = VData.getValueSizeInBits() == 64; if (BaseOpcode->AtomicX2) { @@ -9173,7 +9182,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (Is64Bit) VData = DAG.getBitcast(MVT::v4i32, VData); - ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + if (!BaseOpcode->NoReturn) + ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + DMask = Is64Bit ? 0xf : 0x3; NumVDataDwords = Is64Bit ? 4 : 2; } else { @@ -9399,8 +9410,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex); - if (BaseOpcode->Atomic) - CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + // Keep GLC only when the atomic's result is actually used. + if (BaseOpcode->Atomic && !BaseOpcode->NoReturn) + CPol |= AMDGPU::CPol::GLC; if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | AMDGPU::CPol::VOLATILE)) return Op; @@ -9512,13 +9524,20 @@ SDValue SITargetLowering::lowerImage(SDValue Op, DAG.setNodeMemRefs(NewNode, {MemRef}); } + if (BaseOpcode->NoReturn) { + if (BaseOpcode->Atomic) + return DAG.getMergeValues( + {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL); + + return SDValue(NewNode, 0); + } + if (BaseOpcode->AtomicX2) { SmallVector<SDValue, 1> Elt; DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); } - if (BaseOpcode->NoReturn) - return SDValue(NewNode, 0); + return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail, Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, NumVDataDwords, IsAtomicPacked16Bit, DL); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ebd2e7e..d80a6f3 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1874,9 +1874,13 @@ void SIRegisterInfo::buildSpillLoadStore( } bool IsSrcDstDef = SrcDstRegState & RegState::Define; + bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore; if (NeedSuperRegImpOperand && - (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) + (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) { MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); + if (PartialReloadCopy) + MIB.addReg(ValueReg, RegState::Implicit); + } // The epilog restore of a wwm-scratch register can cause undesired // optimization during machine-cp post PrologEpilogInserter if the same diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 84287b6..1931e0b 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -838,9 +838,10 @@ def S_CBRANCH_G_FORK : SOP2_Pseudo < let SubtargetPredicate = isGFX6GFX7GFX8GFX9; } -let Defs = [SCC] in { -def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">; -} // End Defs = [SCC] +let isCommutable = 1, Defs = [SCC] in +def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32", + [(set i32:$sdst, (UniformUnaryFrag<abs> (sub_oneuse i32:$src0, i32:$src1)))] +>; let SubtargetPredicate = isGFX8GFX9 in { def S_RFE_RESTORE_B64 : SOP2_Pseudo < |
