//===-- AMDGPURegBankLegalize.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Lower G_ instructions that can't be inst-selected with the register bank
/// assignment from AMDGPURegBankSelect based on machine uniformity info.
/// Given types on all operands, some register bank assignments require lowering
/// while others do not.
/// Note: cases where all register bank assignments would require lowering are
/// lowered in the legalizer.
/// For example, vgpr S64 G_AND requires lowering to S32 while sgpr S64 does
/// not. Eliminate sgpr S1 by lowering it to sgpr S32.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegBankLegalizeHelper.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/InitializePasses.h"
#include <memory>
#include <mutex>

#define DEBUG_TYPE "amdgpu-regbanklegalize"

using namespace llvm;
using namespace AMDGPU;

namespace {

class AMDGPURegBankLegalize : public MachineFunctionPass {
public:
  static char ID;

  AMDGPURegBankLegalize() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU Register Bank Legalize";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<GISelCSEAnalysisWrapperPass>();
    AU.addRequired<MachineUniformityAnalysisPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Waterfall expansion can introduce PHIs; if the NoPHIs property were kept,
  // the machine verifier would fail. Clear it.
  MachineFunctionProperties getClearedProperties() const override {
    return MachineFunctionProperties().setNoPHIs();
  }
};

} // End anonymous namespace.
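
// Illustrative sketch of the lowering mentioned in the file header. The MIR
// below is hand-written for exposition (the exact output is produced by
// RegBankLegalizeHelper): VALU has no 64-bit bitwise and, so a divergent S64
// G_AND is split into S32 halves, while the uniform form stays S64 since SALU
// has s_and_b64.
//
//   %res:vgpr(s64) = G_AND %a:vgpr(s64), %b:vgpr(s64)
// ->
//   %alo:vgpr(s32), %ahi:vgpr(s32) = G_UNMERGE_VALUES %a:vgpr(s64)
//   %blo:vgpr(s32), %bhi:vgpr(s32) = G_UNMERGE_VALUES %b:vgpr(s64)
//   %lo:vgpr(s32) = G_AND %alo:vgpr(s32), %blo:vgpr(s32)
//   %hi:vgpr(s32) = G_AND %ahi:vgpr(s32), %bhi:vgpr(s32)
//   %res:vgpr(s64) = G_MERGE_VALUES %lo:vgpr(s32), %hi:vgpr(s32)
//
// %res:sgpr(s64) = G_AND %a:sgpr(s64), %b:sgpr(s64) needs no lowering.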

INITIALIZE_PASS_BEGIN(AMDGPURegBankLegalize, DEBUG_TYPE,
                      "AMDGPU Register Bank Legalize", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPURegBankLegalize, DEBUG_TYPE,
                    "AMDGPU Register Bank Legalize", false, false)

char AMDGPURegBankLegalize::ID = 0;

char &llvm::AMDGPURegBankLegalizeID = AMDGPURegBankLegalize::ID;

FunctionPass *llvm::createAMDGPURegBankLegalizePass() {
  return new AMDGPURegBankLegalize();
}

const RegBankLegalizeRules &getRules(const GCNSubtarget &ST,
                                     MachineRegisterInfo &MRI) {
  static std::mutex GlobalMutex;
  static SmallDenseMap<unsigned, std::unique_ptr<RegBankLegalizeRules>>
      CacheForRuleSet;
  std::lock_guard<std::mutex> Lock(GlobalMutex);
  auto [It, Inserted] = CacheForRuleSet.try_emplace(ST.getGeneration());
  if (Inserted)
    It->second = std::make_unique<RegBankLegalizeRules>(ST, MRI);
  else
    It->second->refreshRefs(ST, MRI);
  return *It->second;
}

class AMDGPURegBankLegalizeCombiner {
  MachineIRBuilder &B;
  MachineRegisterInfo &MRI;
  const SIRegisterInfo &TRI;
  const RegisterBank *SgprRB;
  const RegisterBank *VgprRB;
  const RegisterBank *VccRB;

  static constexpr LLT S1 = LLT::scalar(1);
  static constexpr LLT S16 = LLT::scalar(16);
  static constexpr LLT S32 = LLT::scalar(32);
  static constexpr LLT S64 = LLT::scalar(64);

public:
  AMDGPURegBankLegalizeCombiner(MachineIRBuilder &B, const SIRegisterInfo &TRI,
                                const RegisterBankInfo &RBI)
      : B(B), MRI(*B.getMRI()), TRI(TRI),
        SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
        VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
        VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}

  bool isLaneMask(Register Reg) {
    const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
    if (RB && RB->getID() == AMDGPU::VCCRegBankID)
      return true;

    const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
    return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
  }

  void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
    MI.eraseFromParent();
    if (Optional0 && isTriviallyDead(*Optional0, MRI))
      Optional0->eraseFromParent();
  }

  std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
    MachineInstr *MatchMI = MRI.getVRegDef(Src);
    if (MatchMI->getOpcode() != Opcode)
      return {nullptr, Register()};
    return {MatchMI, MatchMI->getOperand(1).getReg()};
  }

  void tryCombineCopy(MachineInstr &MI) {
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    // Skip copies of physical registers.
    if (!Dst.isVirtual() || !Src.isVirtual())
      return;

    // This is a cross bank copy, sgpr S1 to lane mask.
    //
    // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
    // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
    // ->
    // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
    if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
      auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
      assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
             "sgpr S1 must be result of G_TRUNC of sgpr S32");

      B.setInstr(MI);
      // Ensure that truncated bits in BoolSrc are 0.
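      // Illustrative result of the rewrite built below (names follow the
      // comment above; the CSE builder may reuse an existing constant):
      //   %One:sgpr(s32) = G_CONSTANT i32 1
      //   %BoolSrc:sgpr(s32) = G_AND %TruncS32Src:sgpr(s32), %One:sgpr(s32)
      //   %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %BoolSrc:sgpr(s32)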
      auto One = B.buildConstant({SgprRB, S32}, 1);
      auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
      B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
      cleanUpAfterCombine(MI, Trunc);
      return;
    }

    // Src = G_AMDGPU_READANYLANE RALSrc
    // Dst = COPY Src
    // ->
    // Dst = RALSrc
    if (MRI.getRegBankOrNull(Dst) == VgprRB &&
        MRI.getRegBankOrNull(Src) == SgprRB) {
      auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
      if (!RAL)
        return;

      assert(MRI.getRegBank(RALSrc) == VgprRB);
      MRI.replaceRegWith(Dst, RALSrc);
      cleanUpAfterCombine(MI, RAL);
      return;
    }
  }

  void tryCombineS1AnyExt(MachineInstr &MI) {
    // %Src:sgpr(S1) = G_TRUNC %TruncSrc
    // %Dst = G_ANYEXT %Src:sgpr(S1)
    // ->
    // %Dst = G_... %TruncSrc
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    if (MRI.getType(Src) != S1)
      return;

    auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
    if (!Trunc)
      return;

    LLT DstTy = MRI.getType(Dst);
    LLT TruncSrcTy = MRI.getType(TruncSrc);

    if (DstTy == TruncSrcTy) {
      MRI.replaceRegWith(Dst, TruncSrc);
      cleanUpAfterCombine(MI, Trunc);
      return;
    }

    B.setInstr(MI);

    if (DstTy == S32 && TruncSrcTy == S64) {
      auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
      MRI.replaceRegWith(Dst, Unmerge.getReg(0));
      cleanUpAfterCombine(MI, Trunc);
      return;
    }

    if (DstTy == S64 && TruncSrcTy == S32) {
      B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                            {TruncSrc, B.buildUndef({SgprRB, S32})});
      cleanUpAfterCombine(MI, Trunc);
      return;
    }

    if (DstTy == S32 && TruncSrcTy == S16) {
      B.buildAnyExt(Dst, TruncSrc);
      cleanUpAfterCombine(MI, Trunc);
      return;
    }

    if (DstTy == S16 && TruncSrcTy == S32) {
      B.buildTrunc(Dst, TruncSrc);
      cleanUpAfterCombine(MI, Trunc);
      return;
    }

    llvm_unreachable("missing anyext + trunc combine");
  }
};

// Search through MRI for virtual registers with sgpr register bank and S1 LLT.
[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {
  const LLT S1 = LLT::scalar(1);
  for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) {
    Register Reg = Register::index2VirtReg(i);
    if (MRI.def_empty(Reg) || MRI.getType(Reg) != S1)
      continue;

    const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
    if (RB && RB->getID() == AMDGPU::SGPRRegBankID) {
      LLVM_DEBUG(dbgs() << "Warning: detected sgpr S1 register in: ";
                 MRI.getVRegDef(Reg)->dump());
      return Reg;
    }
  }
  return {};
}

bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasFailedISel())
    return false;

  // Set up the instruction builder with CSE.
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  GISelCSEInfo &CSEInfo = Wrapper.get(TPC.getCSEConfig());
  GISelObserverWrapper Observer;
  Observer.addObserver(&CSEInfo);

  CSEMIRBuilder B(MF);
  B.setCSEInfo(&CSEInfo);
  B.setChangeObserver(Observer);

  RAIIDelegateInstaller DelegateInstaller(MF, &Observer);
  RAIIMFObserverInstaller MFObserverInstaller(MF, Observer);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const RegisterBankInfo &RBI = *ST.getRegBankInfo();
  const MachineUniformityInfo &MUI =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();

  // RegBankLegalizeRules is initialized by assigning sets of rule IDs to
  // opcodes.
  const RegBankLegalizeRules &RBLRules = getRules(ST, MRI);

  // Logic that does legalization based on IDs assigned to Opcode.
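  // For example (illustrative, mirroring the file header): the rule matched
  // for a G_AND with sgpr S64 operands applies a mapping that keeps the
  // instruction whole, while the rule for vgpr S64 operands lowers it to two
  // S32 G_ANDs.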
  RegBankLegalizeHelper RBLHelper(B, MUI, RBI, RBLRules);

  SmallVector<MachineInstr *> AllInst;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      AllInst.push_back(&MI);
    }
  }

  for (MachineInstr *MI : AllInst) {
    if (!MI->isPreISelOpcode())
      continue;

    unsigned Opc = MI->getOpcode();

    // The insert point for use operands needs some calculation.
    if (Opc == AMDGPU::G_PHI) {
      RBLHelper.applyMappingPHI(*MI);
      continue;
    }

    // Opcodes that support pretty much all combinations of reg banks and LLTs
    // (except S1). There is no point in writing rules for them.
    if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
        Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) {
      RBLHelper.applyMappingTrivial(*MI);
      continue;
    }

    // Opcodes that also support S1.
    if (Opc == AMDGPU::G_FREEZE &&
        MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) {
      RBLHelper.applyMappingTrivial(*MI);
      continue;
    }

    if (Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT ||
        Opc == AMDGPU::G_IMPLICIT_DEF) {
      Register Dst = MI->getOperand(0).getReg();
      // Non-S1 types are trivially accepted.
      if (MRI.getType(Dst) != LLT::scalar(1)) {
        assert(MRI.getRegBank(Dst)->getID() == AMDGPU::SGPRRegBankID);
        continue;
      }

      // S1 rules are in RegBankLegalizeRules.
    }

    RBLHelper.findRuleAndApplyMapping(*MI);
  }

  // Sgpr S1 clean up combines:
  // - Sgpr S1(S32) to sgpr S1(S32) Copy: anyext + trunc combine.
  //   In RegBankLegalize, an 'S1 Dst' is legalized into S32 as
  //   'S1Dst = Trunc S32Dst' and an 'S1 Src' as 'S32Src = Anyext S1Src'.
  //   S1 Truncs and Anyexts that come from the legalizer, which can have
  //   non-S32 types, e.g. S16 = Anyext S1 or S1 = Trunc S64, are also
  //   cleaned up.
  // - Sgpr S1(S32) to vcc Copy: G_AMDGPU_COPY_VCC_SCC combine.
  //   A divergent instruction uses sgpr S1 as an input that should be a lane
  //   mask (vcc); legalizing that use creates an sgpr S1(S32) to vcc Copy.

  // Note: After these combines, remaining S1s are either sgpr S1(S32) or
  // vcc S1:
  // - Vcc to vcc Copy: nothing to do here, just a regular copy.
  // - Vcc to sgpr S1 Copy: should not exist in the form of a COPY
  //   instruction(*). For a 'uniform-in-vcc to sgpr-S1 copy',
  //   G_AMDGPU_COPY_SCC_VCC is used instead: when the only available
  //   instruction creates a vcc result, a use of UniformInVcc results in
  //   creating G_AMDGPU_COPY_SCC_VCC.
  // (*) Explanation for 'sgpr S1(uniform) = COPY vcc(divergent)':
  // A copy from a divergent to a uniform register indicates an error in
  // either:
  // - Uniformity analysis: a uniform instruction has a divergent input. If
  //   one of the inputs is divergent, the instruction should be divergent!
  // - The RegBankLegalizer not executing in a waterfall loop (missing
  //   implementation).
  AMDGPURegBankLegalizeCombiner Combiner(B, *ST.getRegisterInfo(), RBI);

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      if (MI.getOpcode() == AMDGPU::COPY) {
        Combiner.tryCombineCopy(MI);
        continue;
      }
      if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
        Combiner.tryCombineS1AnyExt(MI);
        continue;
      }
    }
  }

  assert(!getAnySgprS1(MRI).isValid() &&
         "Registers with sgpr reg bank and S1 LLT are not legal after "
         "AMDGPURegBankLegalize. Should lower to sgpr S32");
  return true;
}