//===-- AMDGPURegBankLegalize.cpp -----------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// Lower G_ instructions that can't be inst-selected with register bank /// assignment from AMDGPURegBankSelect based on machine uniformity info. /// Given types on all operands, some register bank assignments require lowering /// while others do not. /// Note: cases where all register bank assignments would require lowering are /// lowered in legalizer. /// For example vgpr S64 G_AND requires lowering to S32 while sgpr S64 does not. /// Eliminate sgpr S1 by lowering to sgpr S32. // //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUGlobalISelUtils.h" #include "AMDGPURegBankLegalizeHelper.h" #include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/InitializePasses.h" #define DEBUG_TYPE "amdgpu-regbanklegalize" using namespace llvm; using namespace AMDGPU; namespace { class AMDGPURegBankLegalize : public MachineFunctionPass { public: static char ID; public: AMDGPURegBankLegalize() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { return "AMDGPU Register Bank Legalize"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } // If there were no phis and we do waterfall expansion machine verifier would // fail. MachineFunctionProperties getClearedProperties() const override { return MachineFunctionProperties().setNoPHIs(); } }; } // End anonymous namespace. INITIALIZE_PASS_BEGIN(AMDGPURegBankLegalize, DEBUG_TYPE, "AMDGPU Register Bank Legalize", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass) INITIALIZE_PASS_END(AMDGPURegBankLegalize, DEBUG_TYPE, "AMDGPU Register Bank Legalize", false, false) char AMDGPURegBankLegalize::ID = 0; char &llvm::AMDGPURegBankLegalizeID = AMDGPURegBankLegalize::ID; FunctionPass *llvm::createAMDGPURegBankLegalizePass() { return new AMDGPURegBankLegalize(); } const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, MachineRegisterInfo &MRI) { static std::mutex GlobalMutex; static SmallDenseMap> CacheForRuleSet; std::lock_guard Lock(GlobalMutex); auto [It, Inserted] = CacheForRuleSet.try_emplace(ST.getGeneration()); if (Inserted) It->second = std::make_unique(ST, MRI); else It->second->refreshRefs(ST, MRI); return *It->second; } class AMDGPURegBankLegalizeCombiner { MachineIRBuilder &B; MachineRegisterInfo &MRI; const SIRegisterInfo &TRI; const RegisterBank *SgprRB; const RegisterBank *VgprRB; const RegisterBank *VccRB; static constexpr LLT S1 = LLT::scalar(1); static constexpr LLT S16 = LLT::scalar(16); static constexpr LLT S32 = LLT::scalar(32); static constexpr LLT S64 = LLT::scalar(64); public: AMDGPURegBankLegalizeCombiner(MachineIRBuilder &B, const SIRegisterInfo &TRI, const RegisterBankInfo &RBI) : B(B), MRI(*B.getMRI()), TRI(TRI), SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}; bool isLaneMask(Register Reg); std::pair tryMatch(Register Src, unsigned Opcode); std::pair tryMatchRALFromUnmerge(Register Src); Register getReadAnyLaneSrc(Register Src); void replaceRegWithOrBuildCopy(Register Dst, Register Src); bool tryEliminateReadAnyLane(MachineInstr &Copy); void tryCombineCopy(MachineInstr &MI); void tryCombineS1AnyExt(MachineInstr &MI); }; bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) { const RegisterBank *RB = MRI.getRegBankOrNull(Reg); if (RB && RB->getID() == AMDGPU::VCCRegBankID) return true; const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); } std::pair AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) { MachineInstr *MatchMI = MRI.getVRegDef(Src); if (MatchMI->getOpcode() != Opcode) return {nullptr, Register()}; return {MatchMI, MatchMI->getOperand(1).getReg()}; } std::pair AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { MachineInstr *ReadAnyLane = MRI.getVRegDef(Src); if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE) return {nullptr, -1}; Register RALSrc = ReadAnyLane->getOperand(1).getReg(); if (auto *UnMerge = getOpcodeDef(RALSrc, MRI)) return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)}; return {nullptr, -1}; } Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { // Src = G_AMDGPU_READANYLANE RALSrc auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); if (RAL) return RALSrc; // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc // LoSgpr = G_AMDGPU_READANYLANE LoVgpr // HiSgpr = G_AMDGPU_READANYLANE HiVgpr // Src G_MERGE_VALUES LoSgpr, HiSgpr auto *Merge = getOpcodeDef(Src, MRI); if (Merge) { unsigned NumElts = Merge->getNumSources(); auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0)); if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0) return {}; // Check if all elements are from same unmerge and there is no shuffling. for (unsigned i = 1; i < NumElts; ++i) { auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i)); if (UnmergeI != Unmerge || (unsigned)IdxI != i) return {}; } return Unmerge->getSourceReg(); } // SrcRegIdx = G_AMDGPU_READANYLANE RALElSrc // SourceReg G_MERGE_VALUES ..., SrcRegIdx, ... // ..., Src, ... = G_UNMERGE_VALUES SourceReg auto *UnMerge = getOpcodeDef(Src, MRI); if (!UnMerge) return {}; int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr); Merge = getOpcodeDef(UnMerge->getSourceReg(), MRI); if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources()) return {}; Register SrcRegIdx = Merge->getSourceReg(Idx); if (MRI.getType(Src) != MRI.getType(SrcRegIdx)) return {}; auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE); if (RALEl) return RALElSrc; return {}; } void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst, Register Src) { if (Dst.isVirtual()) MRI.replaceRegWith(Dst, Src); else B.buildCopy(Dst, Src); } bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane( MachineInstr &Copy) { Register Dst = Copy.getOperand(0).getReg(); Register Src = Copy.getOperand(1).getReg(); // Skip non-vgpr Dst if (Dst.isVirtual() ? (MRI.getRegBankOrNull(Dst) != VgprRB) : !TRI.isVGPR(MRI, Dst)) return false; // Skip physical source registers and source registers with register class if (!Src.isVirtual() || MRI.getRegClassOrNull(Src)) return false; Register RALDst = Src; MachineInstr &SrcMI = *MRI.getVRegDef(Src); if (SrcMI.getOpcode() == AMDGPU::G_BITCAST) RALDst = SrcMI.getOperand(1).getReg(); Register RALSrc = getReadAnyLaneSrc(RALDst); if (!RALSrc) return false; B.setInstr(Copy); if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) { // Src = READANYLANE RALSrc Src = READANYLANE RALSrc // Dst = Copy Src $Dst = Copy Src // -> -> // Dst = RALSrc $Dst = Copy RALSrc replaceRegWithOrBuildCopy(Dst, RALSrc); } else { // RALDst = READANYLANE RALSrc RALDst = READANYLANE RALSrc // Src = G_BITCAST RALDst Src = G_BITCAST RALDst // Dst = Copy Src Dst = Copy Src // -> -> // NewVgpr = G_BITCAST RALDst NewVgpr = G_BITCAST RALDst // Dst = NewVgpr $Dst = Copy NewVgpr auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc); replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0)); } eraseInstr(Copy, MRI); return true; } void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) { if (tryEliminateReadAnyLane(MI)) return; Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); // Skip copies of physical registers. if (!Dst.isVirtual() || !Src.isVirtual()) return; // This is a cross bank copy, sgpr S1 to lane mask. // // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) // -> // %BoolSrc:sgpr(s32) = G_AND %TruncS32Src:sgpr(s32), 1 // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %BoolSrc:sgpr(s32) if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); assert(Trunc && MRI.getType(TruncS32Src) == S32 && "sgpr S1 must be result of G_TRUNC of sgpr S32"); B.setInstr(MI); // Ensure that truncated bits in BoolSrc are 0. auto One = B.buildConstant({SgprRB, S32}, 1); auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); eraseInstr(MI, MRI); } } void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) { // %Src:sgpr(S1) = G_TRUNC %TruncSrc // %Dst = G_ANYEXT %Src:sgpr(S1) // -> // %Dst = G_... %TruncSrc Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); if (MRI.getType(Src) != S1) return; auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC); if (!Trunc) return; LLT DstTy = MRI.getType(Dst); LLT TruncSrcTy = MRI.getType(TruncSrc); if (DstTy == TruncSrcTy) { MRI.replaceRegWith(Dst, TruncSrc); eraseInstr(MI, MRI); return; } B.setInstr(MI); if (DstTy == S32 && TruncSrcTy == S64) { auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc); MRI.replaceRegWith(Dst, Unmerge.getReg(0)); eraseInstr(MI, MRI); return; } if (DstTy == S64 && TruncSrcTy == S32) { B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {TruncSrc, B.buildUndef({SgprRB, S32})}); eraseInstr(MI, MRI); return; } if (DstTy == S32 && TruncSrcTy == S16) { B.buildAnyExt(Dst, TruncSrc); eraseInstr(MI, MRI); return; } if (DstTy == S16 && TruncSrcTy == S32) { B.buildTrunc(Dst, TruncSrc); eraseInstr(MI, MRI); return; } llvm_unreachable("missing anyext + trunc combine"); } // Search through MRI for virtual registers with sgpr register bank and S1 LLT. [[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) { const LLT S1 = LLT::scalar(1); for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) { Register Reg = Register::index2VirtReg(i); if (MRI.def_empty(Reg) || MRI.getType(Reg) != S1) continue; const RegisterBank *RB = MRI.getRegBankOrNull(Reg); if (RB && RB->getID() == AMDGPU::SGPRRegBankID) { LLVM_DEBUG(dbgs() << "Warning: detected sgpr S1 register in: "; MRI.getVRegDef(Reg)->dump();); return Reg; } } return {}; } bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { if (MF.getProperties().hasFailedISel()) return false; // Setup the instruction builder with CSE. const TargetPassConfig &TPC = getAnalysis(); GISelCSEAnalysisWrapper &Wrapper = getAnalysis().getCSEWrapper(); GISelCSEInfo &CSEInfo = Wrapper.get(TPC.getCSEConfig()); GISelObserverWrapper Observer; Observer.addObserver(&CSEInfo); CSEMIRBuilder B(MF); B.setCSEInfo(&CSEInfo); B.setChangeObserver(Observer); RAIIDelegateInstaller DelegateInstaller(MF, &Observer); RAIIMFObserverInstaller MFObserverInstaller(MF, Observer); const GCNSubtarget &ST = MF.getSubtarget(); MachineRegisterInfo &MRI = MF.getRegInfo(); const RegisterBankInfo &RBI = *ST.getRegBankInfo(); const MachineUniformityInfo &MUI = getAnalysis().getUniformityInfo(); // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes. const RegBankLegalizeRules &RBLRules = getRules(ST, MRI); // Logic that does legalization based on IDs assigned to Opcode. RegBankLegalizeHelper RBLHelper(B, MUI, RBI, RBLRules); SmallVector AllInst; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { AllInst.push_back(&MI); } } for (MachineInstr *MI : AllInst) { if (!MI->isPreISelOpcode()) continue; unsigned Opc = MI->getOpcode(); // Insert point for use operands needs some calculation. if (Opc == AMDGPU::G_PHI) { RBLHelper.applyMappingPHI(*MI); continue; } // Opcodes that support pretty much all combinations of reg banks and LLTs // (except S1). There is no point in writing rules for them. if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES || Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) { RBLHelper.applyMappingTrivial(*MI); continue; } // Opcodes that also support S1. if (Opc == G_FREEZE && MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) { RBLHelper.applyMappingTrivial(*MI); continue; } if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT || Opc == AMDGPU::G_IMPLICIT_DEF)) { Register Dst = MI->getOperand(0).getReg(); // Non S1 types are trivially accepted. if (MRI.getType(Dst) != LLT::scalar(1)) { assert(MRI.getRegBank(Dst)->getID() == AMDGPU::SGPRRegBankID); continue; } // S1 rules are in RegBankLegalizeRules. } RBLHelper.findRuleAndApplyMapping(*MI); } // Sgpr S1 clean up combines: // - Sgpr S1(S32) to sgpr S1(S32) Copy: anyext + trunc combine. // In RegBankLegalize 'S1 Dst' are legalized into S32 as // 'S1Dst = Trunc S32Dst' and 'S1 Src' into 'S32Src = Anyext S1Src'. // S1 Truncs and Anyexts that come from legalizer, that can have non-S32 // types e.g. S16 = Anyext S1 or S1 = Trunc S64, will also be cleaned up. // - Sgpr S1(S32) to vcc Copy: G_AMDGPU_COPY_VCC_SCC combine. // Divergent instruction uses sgpr S1 as input that should be lane mask(vcc) // Legalizing this use creates sgpr S1(S32) to vcc Copy. // Note: Remaining S1 copies, S1s are either sgpr S1(S32) or vcc S1: // - Vcc to vcc Copy: nothing to do here, just a regular copy. // - Vcc to sgpr S1 Copy: Should not exist in a form of COPY instruction(*). // Note: For 'uniform-in-vcc to sgpr-S1 copy' G_AMDGPU_COPY_SCC_VCC is used // instead. When only available instruction creates vcc result, use of // UniformInVcc results in creating G_AMDGPU_COPY_SCC_VCC. // (*)Explanation for 'sgpr S1(uniform) = COPY vcc(divergent)': // Copy from divergent to uniform register indicates an error in either: // - Uniformity analysis: Uniform instruction has divergent input. If one of // the inputs is divergent, instruction should be divergent! // - RegBankLegalizer not executing in waterfall loop (missing implementation) AMDGPURegBankLegalizeCombiner Combiner(B, *ST.getRegisterInfo(), RBI); for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : make_early_inc_range(MBB)) { if (MI.getOpcode() == AMDGPU::COPY) { Combiner.tryCombineCopy(MI); continue; } if (MI.getOpcode() == AMDGPU::G_ANYEXT) { Combiner.tryCombineS1AnyExt(MI); continue; } } } assert(!getAnySgprS1(MRI).isValid() && "Registers with sgpr reg bank and S1 LLT are not legal after " "AMDGPURegBankLegalize. Should lower to sgpr S32"); return true; }