//===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Lower VGPRs above the first 256 on gfx1250.
///
/// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to switch
/// the VGPR addressing mode. A mode change remains effective until the next
/// change. The instruction provides the high bits of a VGPR address for four
/// operands: vdst, src0, src1, and src2, or another four operands depending on
/// the instruction encoding. If bits are set, they are added as MSBs to the
/// corresponding operand's VGPR number.
///
/// There is no need to replace the actual register operands because a high
/// VGPR has the same encoding as the corresponding low VGPR. I.e. v0 has the
/// encoding 0x100, and so does v256; v1 has the encoding 0x101, and so does
/// v257. High VGPRs therefore survive until the actual encoding and produce
/// the same encoded bits.
///
/// As a result the pass only inserts S_SET_VGPR_MSB instructions to provide
/// the actual offset for the VGPR addresses of subsequent instructions. The
/// InstPrinter takes care of printing a low VGPR instead of a high one. In
/// principle it would be possible to print the actual high VGPR numbers, but
/// that would disagree with the disassembler's output and make the asm text
/// non-deterministic.
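///
/// For example, an instruction such as:
///
///   v_add_f32 v2, v256, v257
///
/// is emitted as the pair below. The immediate is left symbolic here since
/// the exact bit layout is defined by the S_SET_VGPR_MSB encoding:
///
///   s_set_vgpr_msb <imm>   ; MSB = 1 for src0 and src1
///   v_add_f32 v2, v0, v1   ; still addresses v256 and v257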
///
/// This pass establishes the convention that non-fall-through basic blocks
/// start with all four MSBs zero; otherwise the disassembly would not be
/// readable. An optimization here is possible but deemed undesirable because
/// of these readability concerns.
///
/// Consequently, the ABI expects all four MSBs to be zero on entry. The pass
/// must run very late in the pipeline to ensure that no changes to VGPR
/// operands are made after it.
//
//===----------------------------------------------------------------------===//

#include "AMDGPULowerVGPREncoding.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/PackedVector.h"
using namespace llvm;

#define DEBUG_TYPE "amdgpu-lower-vgpr-encoding"

namespace {

class AMDGPULowerVGPREncoding {
static constexpr unsigned OpNum = 4;
static constexpr unsigned BitsPerField = 2;
static constexpr unsigned NumFields = 4;
static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
using ModeType = PackedVector<unsigned, BitsPerField,
std::bitset<BitsPerField * NumFields>>;
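// Conceptually: four 2-bit fields packed into a single immediate, where
// field I holds the VGPR MSBs for operand I of the active operand table.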
class ModeTy : public ModeType {
public:
// bitset constructor will set all bits to zero
ModeTy() : ModeType(0) {}
operator int64_t() const { return raw_bits().to_ulong(); }
static ModeTy fullMask() {
ModeTy M;
M.raw_bits().flip();
return M;
}
};

public:
bool run(MachineFunction &MF);

private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
/// Most recent s_set_* instruction.
MachineInstr *MostRecentModeSet;
/// Whether the current mode is known.
bool CurrentModeKnown;
/// Current mode bits.
ModeTy CurrentMode;
/// Current mask of mode bits that instructions since MostRecentModeSet care
/// about.
ModeTy CurrentMask;
/// Length of the current hard clause, in instructions.
unsigned ClauseLen;
/// Number of hard clause instructions remaining.
unsigned ClauseRemaining;
/// Clause group breaks.
unsigned ClauseBreaks;
/// The current S_CLAUSE instruction.
MachineInstr *Clause;
/// Insert a mode change before \p I. \returns true if the mode was changed.
bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I);
/// Reset mode to default.
void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); }
/// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
/// Handle a single \p MI. \returns true if changed.
bool runOnMachineInstr(MachineInstr &MI);
/// Compute the mode and mode mask for a single \p MI given the \p Ops operand
/// mapping. Optionally takes a second array \p Ops2 for VOPD. If provided and
/// an operand from \p Ops is not a VGPR, then \p Ops2 is checked.
void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI,
const AMDGPU::OpName Ops[OpNum],
const AMDGPU::OpName *Ops2 = nullptr);
/// Check if an instruction \p I is within a clause and return a suitable
/// insertion point for a mode change. This may also modify the S_CLAUSE
/// instruction to extend the clause, or drop the clause if it cannot be
/// adjusted.
MachineInstr *handleClause(MachineInstr *I);
};

bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
MachineInstr *I) {
assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());
if (CurrentModeKnown) {
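// If every bit this instruction cares about already matches the current
// mode, no new S_SET_VGPR_MSB is needed; just widen the tracked mask.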
auto Delta = NewMode.raw_bits() ^ CurrentMode.raw_bits();
if ((Delta & Mask.raw_bits()).none()) {
CurrentMask |= Mask;
return false;
}
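// If no instruction since the most recent mode set cares about the bits
// that differ, fold the change into that S_SET_VGPR_MSB instead of
// emitting a new one.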
if (MostRecentModeSet && (Delta & CurrentMask.raw_bits()).none()) {
CurrentMode |= NewMode;
CurrentMask |= Mask;
MostRecentModeSet->getOperand(0).setImm(CurrentMode);
return true;
}
}
I = handleClause(I);
MostRecentModeSet =
BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
.addImm(NewMode);
CurrentMode = NewMode;
CurrentMask = Mask;
CurrentModeKnown = true;
return true;
}

std::optional<unsigned>
AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
if (!MO.isReg())
return std::nullopt;
MCRegister Reg = MO.getReg();
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
if (!RC || !TRI->isVGPRClass(RC))
return std::nullopt;
unsigned Idx = TRI->getHWRegIndex(Reg);
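// Only the low 8 bits of the index fit into the encoding; e.g. v257 has
// hardware index 257 (0x101), so its MSB value is 257 >> 8 == 1.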
return Idx >> 8;
}

void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask,
MachineInstr &MI,
const AMDGPU::OpName Ops[OpNum],
const AMDGPU::OpName *Ops2) {
NewMode = {};
Mask = {};
for (unsigned I = 0; I < OpNum; ++I) {
MachineOperand *Op = TII->getNamedOperand(MI, Ops[I]);
std::optional<unsigned> MSBits;
if (Op)
MSBits = getMSBs(*Op);
#if !defined(NDEBUG)
if (MSBits.has_value() && Ops2) {
auto Op2 = TII->getNamedOperand(MI, Ops2[I]);
if (Op2) {
std::optional<unsigned> MSBits2 = getMSBs(*Op2);
if (MSBits2.has_value() && MSBits != MSBits2)
llvm_unreachable("Invalid VOPD pair was created");
}
}
#endif
if (!MSBits.has_value() && Ops2) {
Op = TII->getNamedOperand(MI, Ops2[I]);
if (Op)
MSBits = getMSBs(*Op);
}
if (!MSBits.has_value())
continue;
// Skip tied uses of src2 of VOP2; these are handled along with the defs,
// and only the vdst bits affect them. We cannot skip tied uses of VOP3:
// those uses are real even if they must match the vdst.
if (Ops[I] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied() &&
(SIInstrInfo::isVOP2(MI) ||
(SIInstrInfo::isVOP3(MI) &&
TII->hasVALU32BitEncoding(MI.getOpcode()))))
continue;
NewMode[I] = MSBits.value();
Mask[I] = FieldMask;
}
}

bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
auto Ops = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());
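// Instructions without operand tables are expected to have no VGPR uses,
// as checked by the assert below.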
if (Ops.first) {
ModeTy NewMode, Mask;
computeMode(NewMode, Mask, MI, Ops.first, Ops.second);
return setMode(NewMode, Mask, &MI);
}
assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
return false;
}

MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) {
if (!ClauseRemaining)
return I;
// A clause cannot start with a special instruction, so place the mode
// change right before the clause.
if (ClauseRemaining == ClauseLen) {
I = Clause->getPrevNode();
assert(I->isBundle());
return I;
}
// If a clause defines breaks, a group cannot start with a mode change;
// just drop the clause.
if (ClauseBreaks) {
Clause->eraseFromBundle();
ClauseRemaining = 0;
return I;
}
// Otherwise grow the recorded clause length to cover the inserted
// instruction if it fits. If it does not fit, the clause just becomes
// shorter. Since the length recorded in the clause is one less than the
// actual length, increment the tracked length after the update. Note that
// SIMM16[5:0] must be 1-62, not 0 or 63.
if (ClauseLen < 63)
Clause->getOperand(0).setImm(ClauseLen | (ClauseBreaks << 8));
++ClauseLen;
return I;
}

bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.has1024AddressableVGPRs())
return false;
TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
bool Changed = false;
ClauseLen = ClauseRemaining = 0;
CurrentMode.reset();
CurrentMask.reset();
CurrentModeKnown = true;
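// Per the ABI convention (see the file comment), all four MSBs are zero
// on function entry.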
for (auto &MBB : MF) {
MostRecentModeSet = nullptr;
for (auto &MI : llvm::make_early_inc_range(MBB.instrs())) {
if (MI.isMetaInstruction())
continue;
if (MI.isTerminator() || MI.isCall()) {
if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
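// The program ends here, so just track the mode as reset without
// emitting an actual S_SET_VGPR_MSB.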
CurrentMode.reset();
CurrentModeKnown = true;
} else
resetMode(&MI);
continue;
}
if (MI.isInlineAsm()) {
if (TII->hasVGPRUses(MI))
resetMode(&MI);
continue;
}
if (MI.getOpcode() == AMDGPU::S_CLAUSE) {
assert(!ClauseRemaining && "Nested clauses are not supported");
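// Decode SIMM16 of S_CLAUSE: bits [5:0] hold the clause length minus one
// and bits [11:8] hold the number of breaks.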
ClauseLen = MI.getOperand(0).getImm();
ClauseBreaks = (ClauseLen >> 8) & 15;
ClauseLen = ClauseRemaining = (ClauseLen & 63) + 1;
Clause = &MI;
continue;
}
Changed |= runOnMachineInstr(MI);
if (ClauseRemaining)
--ClauseRemaining;
}
// If we're falling through to a block that has at least one other
// predecessor, we no longer know the mode.
MachineBasicBlock *Next = MBB.getNextNode();
if (Next && Next->pred_size() >= 2 &&
llvm::is_contained(Next->predecessors(), &MBB)) {
if (CurrentMode.raw_bits().any())
CurrentModeKnown = false;
}
}
return Changed;
}

class AMDGPULowerVGPREncodingLegacy : public MachineFunctionPass {
public:
static char ID;
AMDGPULowerVGPREncodingLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
return AMDGPULowerVGPREncoding().run(MF);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};

} // namespace

char AMDGPULowerVGPREncodingLegacy::ID = 0;
char &llvm::AMDGPULowerVGPREncodingLegacyID = AMDGPULowerVGPREncodingLegacy::ID;

INITIALIZE_PASS(AMDGPULowerVGPREncodingLegacy, DEBUG_TYPE,
"AMDGPU Lower VGPR Encoding", false, false)
PreservedAnalyses
AMDGPULowerVGPREncodingPass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
if (!AMDGPULowerVGPREncoding().run(MF))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
return PA;
}