//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
/// Lane mask merging uses the same algorithm as SDAG in SILowerI1Copies.
/// Handles all cases of temporal divergence.
/// For divergent non-phi i1 and uniform i1 uses outside of the cycle, this
/// pass currently depends on LCSSA to insert phis with a single incoming
/// value.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "SILowerI1Copies.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"

using namespace llvm;

namespace {

class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
public:
  static char ID;

public:
  AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU GlobalISel divergence lowering";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addRequired<MachineUniformityAnalysisPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class DivergenceLoweringHelper : public PhiLoweringHelper {
public:
  DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                           MachinePostDominatorTree *PDT,
                           MachineUniformityInfo *MUI);

private:
  MachineUniformityInfo *MUI = nullptr;
  MachineIRBuilder B;

  Register buildRegCopyToLaneMask(Register Reg);

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

  bool lowerTemporalDivergence();
  bool lowerTemporalDivergenceI1();
};

DivergenceLoweringHelper::DivergenceLoweringHelper(
    MachineFunction *MF, MachineDominatorTree *DT,
    MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
    : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}

// _(s1) -> SReg_32/64(s1)
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
  assert(MRI->getType(DstReg) == LLT::scalar(1));

  if (MRI->getRegClassOrNull(DstReg)) {
    if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
      return;
    llvm_unreachable("Failed to constrain register class");
  }

  MRI->setRegClass(DstReg, ST->getBoolRC());
}

void DivergenceLoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  LLT S1 = LLT::scalar(1);

  // Add divergent i1 phis to the list.
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      Register Dst = MI.getOperand(0).getReg();
      if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
        Vreg1Phis.push_back(&MI);
    }
  }
}
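// Illustrative shape of a candidate collected above (a sketch; block and
// virtual register names are hypothetical, not taken from a real test):
//
//   bb.2:
//     %phi:_(s1) = G_PHI %val0(s1), %bb.0, %val1(s1), %bb.1
//
// When %phi is divergent, markAsLaneMask constrains it to the wave-size
// boolean register class and buildMergeLaneMasks rewrites each incoming value
// so that only the bits of the lanes active in the incoming block are updated.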
void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    Incomings.emplace_back(MI->getOperand(i).getReg(),
                           MI->getOperand(i + 1).getMBB(), Register());
  }
}

// Redefine OldReg with a copy of NewReg, inserted after the phis.
void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                             MachineBasicBlock *MBB) {
  BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
      .addReg(NewReg);
}

// Copy Reg to a new lane mask register: insert the copy after the instruction
// that defines Reg, skipping phis if needed.
Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
  Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  MachineInstr *Instr = MRI->getVRegDef(Reg);
  MachineBasicBlock *MBB = Instr->getParent();
  B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));

  B.buildCopy(LaneMask, Reg);
  return LaneMask;
}

// bb.previous
// %PrevReg = ...
//
// bb.current
// %CurReg = ...
//
// %DstReg - not defined
//
// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
//
// bb.previous
// %PrevReg = ...
// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
//
// bb.current
// %CurReg = ...
// %CurRegCopy:sreg_32(s1) = COPY %CurReg
// ...
// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes to 0
// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0
// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg
//
// DstReg = for active lanes, rewrite the bit in PrevReg with the bit from
// CurReg.
void DivergenceLoweringHelper::buildMergeLaneMasks(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
    Register DstReg, Register PrevReg, Register CurReg) {
  // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
  // TODO: check if inputs are constants or results of a compare.

  Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
  Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
  Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);

  B.setInsertPt(MBB, I);
  B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
  B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
  B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
}
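// Worked example of the merge above on concrete wave32 masks (values are
// hypothetical, chosen only to illustrate the bitwise identity):
//
//   EXEC       = 0b0110                     ; lanes 1 and 2 active
//   PrevReg    = 0b1010
//   CurReg     = 0b0101
//   PrevMasked = PrevReg & ~EXEC = 0b1000   ; inactive lanes keep PrevReg
//   CurMasked  = CurReg  &  EXEC = 0b0100   ; active lanes take CurReg
//   DstReg     = PrevMasked | CurMasked = 0b1100
//
// Lanes 1 and 2 now hold CurReg's bits (0 and 1); lanes 0 and 3 keep
// PrevReg's bits (0 and 1).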
// GlobalISel has to constrain S1 incoming values taken as-is with the lane
// mask register class. Insert a copy of Incoming.Reg to a new lane mask
// inside Incoming.Block; Incoming.Reg becomes that new lane mask.
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
  B.setInsertPt(*In.Block, In.Block->getFirstTerminator());

  auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
  MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
  In.Reg = Copy.getReg(0);
}

void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
                                Register NewReg) {
  for (MachineOperand &Op : Inst->operands()) {
    if (Op.isReg() && Op.getReg() == Reg)
      Op.setReg(NewReg);
  }
}

bool DivergenceLoweringHelper::lowerTemporalDivergence() {
  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
  DenseMap<Register, Register> TDCache;

  for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
        ILMA.isS32S64LaneMask(Reg))
      continue;

    Register CachedTDCopy = TDCache.lookup(Reg);
    if (CachedTDCopy) {
      replaceUsesOfRegInInstWith(Reg, UseInst, CachedTDCopy);
      continue;
    }

    MachineInstr *Inst = MRI->getVRegDef(Reg);
    MachineBasicBlock *MBB = Inst->getParent();
    B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));

    Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
    B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
        .addUse(ExecReg, RegState::Implicit);

    replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
    TDCache[Reg] = VgprReg;
  }
  return false;
}

bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
  initializeLaneMaskRegisterAttributes(BoolS1);
  MachineSSAUpdater SSAUpdater(*MF);

  // In case of a use outside multiple nested cycles, or multiple uses, we
  // only need to merge the lane mask across the largest relevant cycle.
  SmallDenseMap<Register, std::pair<const MachineCycle *, Register>> LRCCache;
  for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    auto [LRCCacheIter, RegNotCached] = LRCCache.try_emplace(Reg);
    auto &CycleMergedMask = LRCCacheIter->getSecond();
    const MachineCycle *&CachedLRC = CycleMergedMask.first;
    if (RegNotCached || LRC->contains(CachedLRC)) {
      CachedLRC = LRC;
    }
  }

  for (auto &LRCCacheEntry : LRCCache) {
    Register Reg = LRCCacheEntry.first;
    auto &CycleMergedMask = LRCCacheEntry.getSecond();
    const MachineCycle *Cycle = CycleMergedMask.first;

    Register MergedMask = MRI->createVirtualRegister(BoolS1);
    SSAUpdater.Initialize(MergedMask);

    MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
    SSAUpdater.AddAvailableValue(MBB, MergedMask);

    for (auto Entry : Cycle->getEntries()) {
      for (MachineBasicBlock *Pred : Entry->predecessors()) {
        if (!Cycle->contains(Pred)) {
          B.setInsertPt(*Pred, Pred->getFirstTerminator());
          auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
          SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
        }
      }
    }

    buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
                        SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);

    CycleMergedMask.second = MergedMask;
  }

  for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache.lookup(Reg).second);
  }

  return false;
}

} // End anonymous namespace.
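// Illustrative temporal-divergence case handled by the helpers above (a
// sketch; block and register names are hypothetical):
//
//   bb.loop:
//     %uni:_(s32) = ...              ; uniform inside the cycle
//     ...                            ; divergent exit branch
//   bb.exit:
//     ... = use of %uni
//
// Lanes can leave the loop in different iterations, so the per-lane value of
// %uni observed at bb.exit may differ even though %uni is uniform inside the
// cycle. lowerTemporalDivergence copies %uni to a VGPR right after its
// definition (the implicit use of exec models that only active lanes are
// updated, so inactive lanes keep the value from their last active
// iteration) and rewrites the use in bb.exit:
//
//   %vgpr:_(s32) = COPY %uni(s32), implicit $exec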
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                      "AMDGPU GlobalISel divergence lowering", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                    "AMDGPU GlobalISel divergence lowering", false, false)

char AMDGPUGlobalISelDivergenceLowering::ID = 0;

char &llvm::AMDGPUGlobalISelDivergenceLoweringID =
    AMDGPUGlobalISelDivergenceLowering::ID;

FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
  return new AMDGPUGlobalISelDivergenceLowering();
}

bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
    MachineFunction &MF) {
  MachineDominatorTree &DT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &PDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  MachineUniformityInfo &MUI =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();

  DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);

  bool Changed = false;
  // Temporal divergence lowering needs to inspect the list of instructions
  // used outside of a cycle with a divergent exit, provided by uniformity
  // analysis. Uniform instructions from the list require lowering and no
  // instruction is deleted, so it needs to run before lowerPhis, which
  // deletes phis that require lowering and replaces them with new
  // instructions.

  // Non-i1 temporal divergence lowering.
  Changed |= Helper.lowerTemporalDivergence();
  // This covers both uniform and divergent i1s. Lane masks are in sgpr and
  // need to be updated in each iteration.
  Changed |= Helper.lowerTemporalDivergenceI1();
  // Temporal divergence lowering of a divergent i1 phi used outside of the
  // cycle could also be handled by lowerPhis, but we do it in
  // lowerTemporalDivergenceI1 since in some cases lowerPhis does unnecessary
  // lane mask merging.
  Changed |= Helper.lowerPhis();
  return Changed;
}