//===-- GCNPreRALongBranchReg.cpp ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// \file
// \brief Pass to estimate pre-RA branch size and reserve a pair of SGPRs if
// there is a long branch. Branch size at this point is difficult to track
// since we have no idea what spills will be inserted later on. We just assume
// 8 bytes per instruction to compute an approximation, without computing the
// actual instruction sizes, and check whether that lands in the neighborhood
// of the maximum branch distance. Tuning of what is considered "long" is
// handled through the amdgpu-long-branch-factor cl argument, which sets
// LongBranchFactor.
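//
// For example, with the default LongBranchFactor of 1.0: if a destination
// block's estimated offset does not fit in the branch's immediate field (for
// S_BRANCH, a signed 16-bit word offset, i.e. roughly +/-128 KiB of reach),
// the pass reserves an SGPR pair. The offsets used are the conservative
// 8-bytes-per-instruction estimates, not real encoded sizes.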
//===----------------------------------------------------------------------===//
#include "GCNPreRALongBranchReg.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-pre-ra-long-branch-reg"
namespace {
static cl::opt<double> LongBranchFactor(
    "amdgpu-long-branch-factor", cl::init(1.0), cl::Hidden,
    cl::desc("Factor to apply to what qualifies as a long branch "
             "to reserve a pair of scalar registers. If this value "
             "is 0 the long branch registers are never reserved. As this "
             "value grows, the branch distance is more likely to fall "
             "within the threshold and the registers will be marked to be "
             "reserved. We lean towards always reserving a register for "
             "long jumps"));

class GCNPreRALongBranchReg {
  struct BasicBlockInfo {
    // Offset - Distance from the beginning of the function to the beginning
    // of this basic block.
    uint64_t Offset = 0;
    // Size - Size of the basic block in bytes.
    uint64_t Size = 0;
  };

  void generateBlockInfo(MachineFunction &MF,
                         SmallVectorImpl<BasicBlockInfo> &BlockInfo);

public:
  GCNPreRALongBranchReg() = default;

  bool run(MachineFunction &MF);
};

class GCNPreRALongBranchRegLegacy : public MachineFunctionPass {
public:
  static char ID;

  GCNPreRALongBranchRegLegacy() : MachineFunctionPass(ID) {
    initializeGCNPreRALongBranchRegLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    return GCNPreRALongBranchReg().run(MF);
  }

  StringRef getPassName() const override {
    return "AMDGPU Pre-RA Long Branch Reg";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

char GCNPreRALongBranchRegLegacy::ID = 0;

INITIALIZE_PASS(GCNPreRALongBranchRegLegacy, DEBUG_TYPE,
                "AMDGPU Pre-RA Long Branch Reg", false, false)

char &llvm::GCNPreRALongBranchRegID = GCNPreRALongBranchRegLegacy::ID;

void GCNPreRALongBranchReg::generateBlockInfo(
    MachineFunction &MF, SmallVectorImpl<BasicBlockInfo> &BlockInfo) {
  BlockInfo.resize(MF.getNumBlockIDs());

  // Approximate the size of all basic blocks by just
  // assuming 8 bytes per instruction.
  for (const MachineBasicBlock &MBB : MF) {
    uint64_t NumInstr = 0;
    // Loop through the basic block and count all non-debug,
    // non-meta instructions.
    for (const MachineInstr &MI : MBB) {
      // isMetaInstruction is a superset of isDebugInstr.
      if (MI.isMetaInstruction())
        continue;
      NumInstr += 1;
    }
    // Approximate the size as just 8 bytes per instruction.
    BlockInfo[MBB.getNumber()].Size = 8 * NumInstr;
  }
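
  // Note: 8 bytes per instruction is deliberately conservative (most GCN
  // encodings are 4 or 8 bytes); the overestimate leaves headroom for spill
  // and other late-inserted code that is not visible at this point.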
  uint64_t PrevNum = MF.begin()->getNumber();
  for (MachineBasicBlock &MBB :
       make_range(std::next(MF.begin()), MF.end())) {
    uint64_t Num = MBB.getNumber();
    // This block begins where the previous one ends, so its offset is the
    // previous block's offset plus the previous block's estimated size.
    BlockInfo[Num].Offset = BlockInfo[PrevNum].Offset + BlockInfo[PrevNum].Size;
    PrevNum = Num;
  }
}

bool GCNPreRALongBranchReg::run(MachineFunction &MF) {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = STM.getInstrInfo();
  const SIRegisterInfo *TRI = STM.getRegisterInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // For now, reserve the highest available SGPR pair. After RA, it is shifted
  // down to a lower unused pair of SGPRs. If all registers are used,
  // findUnusedRegister will return AMDGPU::NoRegister and we bail out.
  constexpr bool ReserveHighestRegister = true;
  Register LongBranchReservedReg = TRI->findUnusedRegister(
      MRI, &AMDGPU::SGPR_64RegClass, MF, ReserveHighestRegister);
  if (!LongBranchReservedReg)
    return false;

  // Approximate the code size and offset of each basic block.
  SmallVector<BasicBlockInfo, 16> BlockInfo;
  generateBlockInfo(MF, BlockInfo);

  for (const MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::const_iterator Last = MBB.getLastNonDebugInstr();
    if (Last == MBB.end() || !Last->isUnconditionalBranch())
      continue;

    MachineBasicBlock *DestBB = TII->getBranchDestBlock(*Last);
    uint64_t BlockDistance = static_cast<uint64_t>(
        LongBranchFactor * BlockInfo[DestBB->getNumber()].Offset);
    // If the scaled distance falls outside the branch's encodable range,
    // assume it is a long branch and reserve the registers.
    if (!TII->isBranchOffsetInRange(Last->getOpcode(), BlockDistance)) {
      MFI->setLongBranchReservedReg(LongBranchReservedReg);
      return true;
    }
  }
  return false;
}
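
// Note: the SGPR pair recorded via setLongBranchReservedReg is consumed after
// register allocation, when branch relaxation expands out-of-range branches
// (see SIInstrInfo::insertIndirectBranch); reserving it this early avoids
// having to scavenge a free SGPR pair at that point.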

PreservedAnalyses
GCNPreRALongBranchRegPass::run(MachineFunction &MF,
                               MachineFunctionAnalysisManager &MFAM) {
  GCNPreRALongBranchReg().run(MF);
  return PreservedAnalyses::all();
}