diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIFrameLowering.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 87 |
1 file changed, 77 insertions, 10 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 6a38679..11552b3 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -946,8 +946,18 @@ static Register buildScratchExecCopy(LiveRegUnits &LiveUnits, initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); - ScratchExecCopy = findScratchNonCalleeSaveRegister( - MRI, LiveUnits, *TRI.getWaveMaskRegClass()); + if (FuncInfo->isWholeWaveFunction()) { + // Whole wave functions already have a copy of the original EXEC mask that + // we can use. + assert(IsProlog && "Epilog should look at return, not setup"); + ScratchExecCopy = + TII->getWholeWaveFunctionSetup(MF)->getOperand(0).getReg(); + assert(ScratchExecCopy && "Couldn't find copy of EXEC"); + } else { + ScratchExecCopy = findScratchNonCalleeSaveRegister( + MRI, LiveUnits, *TRI.getWaveMaskRegClass()); + } + if (!ScratchExecCopy) report_fatal_error("failed to find free scratch register"); @@ -996,10 +1006,15 @@ void SIFrameLowering::emitCSRSpillStores( }; StoreWWMRegisters(WWMScratchRegs); + + auto EnableAllLanes = [&]() { + unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); + }; + if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { - unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); + EnableAllLanes(); } else { ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true, @@ -1008,7 +1023,18 @@ void SIFrameLowering::emitCSRSpillStores( } StoreWWMRegisters(WWMCalleeSavedRegs); - if (ScratchExecCopy) { + if (FuncInfo->isWholeWaveFunction()) { + // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove + // it now. 
If we have already saved some WWM CSR registers, then the EXEC is + // already -1 and we don't need to do anything else. Otherwise, set EXEC to + // -1 here. + if (!ScratchExecCopy) + buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true, + /*EnableInactiveLanes*/ true); + else if (WWMCalleeSavedRegs.empty()) + EnableAllLanes(); + TII->getWholeWaveFunctionSetup(MF)->eraseFromParent(); + } else if (ScratchExecCopy) { // FIXME: Split block and make terminator. unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) @@ -1083,11 +1109,6 @@ void SIFrameLowering::emitCSRSpillRestores( Register ScratchExecCopy; SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); - if (!WWMScratchRegs.empty()) - ScratchExecCopy = - buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, - /*IsProlog*/ false, /*EnableInactiveLanes*/ true); - auto RestoreWWMRegisters = [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { for (const auto &Reg : WWMRegs) { @@ -1098,6 +1119,36 @@ void SIFrameLowering::emitCSRSpillRestores( } }; + if (FuncInfo->isWholeWaveFunction()) { + // For whole wave functions, the EXEC is already -1 at this point. + // Therefore, we can restore the CSR WWM registers right away. + RestoreWWMRegisters(WWMCalleeSavedRegs); + + // The original EXEC is the first operand of the return instruction. + const MachineInstr &Return = MBB.instr_back(); + assert(Return.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN && + "Unexpected return inst"); + Register OrigExec = Return.getOperand(0).getReg(); + + if (!WWMScratchRegs.empty()) { + unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64; + BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec()) + .addReg(OrigExec) + .addImm(-1); + RestoreWWMRegisters(WWMScratchRegs); + } + + // Restore original EXEC. 
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec); + return; + } + + if (!WWMScratchRegs.empty()) { + ScratchExecCopy = + buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, + /*IsProlog=*/false, /*EnableInactiveLanes=*/true); + } RestoreWWMRegisters(WWMScratchRegs); if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { @@ -1634,6 +1685,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, NeedExecCopyReservedReg = true; else if (MI.getOpcode() == AMDGPU::SI_RETURN || MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || (MFI->isChainFunction() && TII->isChainCallOpcode(MI.getOpcode()))) { // We expect all return to be the same size. @@ -1662,6 +1714,21 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, if (MFI->isEntryFunction()) return; + if (MFI->isWholeWaveFunction()) { + // In practice, all the VGPRs are WWM registers, and we will need to save at + // least their inactive lanes. Add them to WWMReservedRegs. + assert(!NeedExecCopyReservedReg && + "Whole wave functions can use the reg mapped for their i1 argument"); + + // FIXME: Be more efficient! + for (MCRegister Reg : AMDGPU::VGPR_32RegClass) + if (MF.getRegInfo().isPhysRegModified(Reg)) { + MFI->reserveWWMRegister(Reg); + MF.begin()->addLiveIn(Reg); + } + MF.begin()->sortUniqueLiveIns(); + } + // Remove any VGPRs used in the return value because these do not need to be saved. // This prevents CSR restore from clobbering return VGPRs. if (ReturnMI) { |