//===-- SILateBranchLowering.cpp - Final preparation of branches ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass mainly lowers early terminate pseudo instructions.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachinePassManager.h"

using namespace llvm;

#define DEBUG_TYPE "si-late-branch-lowering"

namespace {

class SILateBranchLowering {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  MachineDominatorTree *MDT = nullptr;

  void expandChainCall(MachineInstr &MI, const GCNSubtarget &ST,
                       bool DynamicVGPR);
  void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);

public:
  SILateBranchLowering(MachineDominatorTree *MDT) : MDT(MDT) {}

  bool run(MachineFunction &MF);

  unsigned MovOpc;
  Register ExecReg;
};

class SILateBranchLoweringLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILateBranchLoweringLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    auto *MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
    return SILateBranchLowering(MDT).run(MF);
  }

  StringRef getPassName() const override {
    return "SI Final Branch Preparation";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SILateBranchLoweringLegacy::ID = 0;

INITIALIZE_PASS_BEGIN(SILateBranchLoweringLegacy, DEBUG_TYPE,
                      "SI insert s_cbranch_execz instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILateBranchLoweringLegacy, DEBUG_TYPE,
                    "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SILateBranchLoweringPassID = SILateBranchLoweringLegacy::ID;

static void generateEndPgm(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, DebugLoc DL,
                           const SIInstrInfo *TII, MachineFunction &MF) {
  const Function &F = MF.getFunction();
  bool IsPS = F.getCallingConv() == CallingConv::AMDGPU_PS;

  // Check if hardware has been configured to expect color or depth exports.
  bool HasColorExports = AMDGPU::getHasColorExport(F);
  bool HasDepthExports = AMDGPU::getHasDepthExport(F);
  bool HasExports = HasColorExports || HasDepthExports;

  // Prior to GFX10, hardware always expects at least one export for PS.
  bool MustExport = !AMDGPU::isGFX10Plus(TII->getSubtarget());

  if (IsPS && (HasExports || MustExport)) {
    // Generate "null export" if hardware is expecting PS to export.
    const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
    int Target =
        ST.hasNullExportTarget()
            ? AMDGPU::Exp::ET_NULL
            : (HasColorExports ? AMDGPU::Exp::ET_MRT0 : AMDGPU::Exp::ET_MRTZ);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
        .addImm(Target)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addImm(1)  // vm
        .addImm(0)  // compr
        .addImm(0); // en
  }

  // s_endpgm
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
}
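/// Split \p MBB after \p MI into a new block and mirror the CFG change in the
/// dominator tree: the split-off tail takes over \p MBB's former successor
/// edges.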
static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
                       MachineDominatorTree *MDT) {
  MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);

  // Update dominator tree
  using DomTreeT = DomTreeBase<MachineBasicBlock>;
  SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
  for (MachineBasicBlock *Succ : SplitBB->successors()) {
    DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
    DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
  }
  DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
  MDT->applyUpdates(DTUpdates);
}

static void copyOpWithoutRegFlags(MachineInstrBuilder &MIB,
                                  MachineOperand &Op) {
  if (Op.isReg())
    MIB.addReg(Op.getReg());
  else
    MIB.add(Op);
}

void SILateBranchLowering::expandChainCall(MachineInstr &MI,
                                           const GCNSubtarget &ST,
                                           bool DynamicVGPR) {
  // This is a tail call that needs to be expanded into at least
  // 2 instructions, one for setting EXEC and one for the actual tail call.
  int ExecIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::exec);
  assert(ExecIdx != -1 && "Missing EXEC operand");
  const DebugLoc &DL = MI.getDebugLoc();

  if (DynamicVGPR) {
    // We have 3 extra operands and we need to:
    // * Try to change the VGPR allocation
    // * Select the callee based on the result of the reallocation attempt
    // * Select the EXEC mask based on the result of the reallocation attempt
    // If any of the register operands of the chain pseudo is used in more than
    // one of these instructions, we need to make sure that the kill flags
    // aren't copied along.
    auto AllocMI =
        BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::S_ALLOC_VGPR));
    copyOpWithoutRegFlags(AllocMI,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::numvgprs));

    auto SelectCallee =
        BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::S_CSELECT_B64))
            .addDef(TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg());
    copyOpWithoutRegFlags(SelectCallee,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::src0));
    copyOpWithoutRegFlags(SelectCallee,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::fbcallee));

    auto SelectExec = BuildMI(*MI.getParent(), MI, DL,
                              TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                                     : AMDGPU::S_CSELECT_B64))
                          .addDef(ExecReg);
    copyOpWithoutRegFlags(SelectExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::exec));
    copyOpWithoutRegFlags(SelectExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::fbexec));
  } else {
    auto SetExec = BuildMI(*MI.getParent(), MI, DL, TII->get(MovOpc), ExecReg);
    copyOpWithoutRegFlags(SetExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::exec));
  }

  for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx)
    MI.removeOperand(OpIdx);

  MI.setDesc(TII->get(AMDGPU::SI_TCRETURN));
}
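/// Insert a branch to \p EarlyExitBlock, taken when SCC is 0, ahead of \p MI,
/// splitting the block if non-terminator instructions follow the branch.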
void SILateBranchLowering::earlyTerm(MachineInstr &MI,
                                     MachineBasicBlock *EarlyExitBlock) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc DL = MI.getDebugLoc();

  auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
                      .addMBB(EarlyExitBlock);
  auto Next = std::next(MI.getIterator());

  if (Next != MBB.end() && !Next->isTerminator())
    splitBlock(MBB, *BranchMI, MDT);

  MBB.addSuccessor(EarlyExitBlock);
  MDT->insertEdge(&MBB, EarlyExitBlock);
}

PreservedAnalyses
llvm::SILateBranchLoweringPass::run(MachineFunction &MF,
                                    MachineFunctionAnalysisManager &MFAM) {
  auto *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
  if (!SILateBranchLowering(MDT).run(MF))
    return PreservedAnalyses::all();

  return getMachineFunctionPassPreservedAnalyses()
      .preserve<MachineDominatorTreeAnalysis>();
}
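/// Main worker: fold away S_BRANCH to the layout successor, expand
/// SI_CS_CHAIN_TC pseudos, route SI_EARLY_TERMINATE_SCC0 through a shared
/// early-exit block, and redirect SI_RETURN_TO_EPILOG to the function end.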
bool SILateBranchLowering::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  SmallVector<MachineInstr *, 4> EarlyTermInstrs;
  SmallVector<MachineInstr *, 4> EpilogInstrs;
  bool MadeChange = false;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // This only occurs in -O0 when BranchFolding is not executed.
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          assert(&MI == &MBB.back());
          MI.eraseFromParent();
          MadeChange = true;
        }
        break;

      case AMDGPU::SI_CS_CHAIN_TC_W32:
      case AMDGPU::SI_CS_CHAIN_TC_W64:
        expandChainCall(MI, ST, /*DynamicVGPR=*/false);
        MadeChange = true;
        break;
      case AMDGPU::SI_CS_CHAIN_TC_W32_DVGPR:
      case AMDGPU::SI_CS_CHAIN_TC_W64_DVGPR:
        expandChainCall(MI, ST, /*DynamicVGPR=*/true);
        MadeChange = true;
        break;

      case AMDGPU::SI_EARLY_TERMINATE_SCC0:
        EarlyTermInstrs.push_back(&MI);
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        EpilogInstrs.push_back(&MI);
        break;

      default:
        break;
      }
    }
  }

  // Lower any early exit branches first
  if (!EarlyTermInstrs.empty()) {
    MachineBasicBlock *EarlyExitBlock = MF.CreateMachineBasicBlock();
    DebugLoc DL;

    MF.insert(MF.end(), EarlyExitBlock);
    BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc),
            ExecReg)
        .addImm(0);
    generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, MF);

    for (MachineInstr *Instr : EarlyTermInstrs) {
      // Early termination in GS does nothing
      if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
        earlyTerm(*Instr, EarlyExitBlock);
      Instr->eraseFromParent();
    }

    EarlyTermInstrs.clear();
    MadeChange = true;
  }

  // Now ensure that any return-to-epilog instruction occurs at the function
  // end.
  if (!EpilogInstrs.empty()) {
    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

    // If there are multiple returns to epilog then all will
    // become jumps to new empty end block.
    if (EpilogInstrs.size() > 1) {
      EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
      MF.insert(MF.end(), EmptyMBBAtEnd);
    }

    for (auto *MI : EpilogInstrs) {
      auto *MBB = MI->getParent();
      if (MBB == &MF.back() && MI == &MBB->back())
        continue;

      // SI_RETURN_TO_EPILOG is not the last instruction.
      // Jump to empty block at function end.
      if (!EmptyMBBAtEnd) {
        EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
        MF.insert(MF.end(), EmptyMBBAtEnd);
      }

      MBB->addSuccessor(EmptyMBBAtEnd);
      MDT->insertEdge(MBB, EmptyMBBAtEnd);
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
          .addMBB(EmptyMBBAtEnd);
      MI->eraseFromParent();
      MadeChange = true;
    }

    EpilogInstrs.clear();
  }

  return MadeChange;
}