//===- DistributionUtils.cpp - Distribution tools for GPUOps --------------===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utility methods for GPU warp distribution patterns.
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/DenseMap.h"
#include <numeric>
using namespace mlir;
using namespace mlir::gpu;
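
// Clone `warpOp` into a new WarpExecuteOnLane0Op with result types
// `newReturnTypes`, move the original body into it, and rewrite the
// terminator to yield `newYieldedValues`. The original op, now with an
// empty region, is left in place for the caller to replace or erase.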
WarpExecuteOnLane0Op
WarpDistributionPattern::moveRegionToNewWarpOpAndReplaceReturns(
RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
ValueRange newYieldedValues, TypeRange newReturnTypes) const {
// Create a new op before the existing one, with the extra operands.
OpBuilder::InsertionGuard g(rewriter);
rewriter.setInsertionPoint(warpOp);
auto newWarpOp = WarpExecuteOnLane0Op::create(
rewriter, warpOp.getLoc(), newReturnTypes, warpOp.getLaneid(),
warpOp.getWarpSize(), warpOp.getArgs(),
warpOp.getBody()->getArgumentTypes());
Region &opBody = warpOp.getBodyRegion();
Region &newOpBody = newWarpOp.getBodyRegion();
Block &newOpFirstBlock = newOpBody.front();
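  // Splice the old op's body into the new op, then drop the empty block the
  // builder created for the new region.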
rewriter.inlineRegionBefore(opBody, newOpBody, newOpBody.begin());
rewriter.eraseBlock(&newOpFirstBlock);
assert(newWarpOp.getWarpRegion().hasOneBlock() &&
"expected WarpOp with single block");
auto yield =
cast<gpu::YieldOp>(newOpBody.getBlocks().begin()->getTerminator());
rewriter.modifyOpInPlace(
yield, [&]() { yield.getValuesMutable().assign(newYieldedValues); });
return newWarpOp;
}
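
// Variant of the above that appends `newYieldedValues` to the values the op
// already yields instead of replacing them. A value that is already yielded
// is not duplicated; `indices` receives, for each requested value, its
// position among the new op's results. The original op is replaced here, so
// callers only need the returned op.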
WarpExecuteOnLane0Op
WarpDistributionPattern::moveRegionToNewWarpOpAndAppendReturns(
RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
ValueRange newYieldedValues, TypeRange newReturnTypes,
SmallVector<size_t> &indices) const {
SmallVector<Type> types(warpOp.getResultTypes().begin(),
warpOp.getResultTypes().end());
gpu::YieldOp yield = warpOp.getTerminator();
SmallVector<Value> yieldValues(yield.getOperands().begin(),
yield.getOperands().end());
llvm::SmallDenseMap<Value, unsigned> indexLookup;
// Record the value -> first index mapping for faster lookup.
for (auto [i, v] : llvm::enumerate(yieldValues)) {
if (!indexLookup.count(v))
indexLookup[v] = i;
}
for (auto [value, type] : llvm::zip_equal(newYieldedValues, newReturnTypes)) {
// If the value already exists in the yield, don't create a new output.
if (indexLookup.count(value)) {
indices.push_back(indexLookup[value]);
} else {
// If the value is new, add it to the yield and to the types.
yieldValues.push_back(value);
types.push_back(type);
indices.push_back(yieldValues.size() - 1);
}
}
WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
rewriter, warpOp, yieldValues, types);
rewriter.replaceOp(warpOp,
newWarpOp.getResults().take_front(warpOp.getNumResults()));
return newWarpOp;
}
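
// Find the first operand of the warp op's terminator that is produced by an
// operation satisfying `fn` and whose corresponding warp op result is still
// in use. Returns nullptr if no such operand exists.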
OpOperand *WarpDistributionPattern::getWarpResult(
WarpExecuteOnLane0Op warpOp,
llvm::function_ref<bool(Operation *)> fn) const {
gpu::YieldOp yield = warpOp.getTerminator();
for (OpOperand &yieldOperand : yield->getOpOperands()) {
    Value yieldValue = yieldOperand.get();
    Operation *definingOp = yieldValue.getDefiningOp();
    if (definingOp && fn(definingOp)) {
if (!warpOp.getResult(yieldOperand.getOperandNumber()).use_empty())
return &yieldOperand;
}
}
return nullptr;
}
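
// Delinearize `laneId` into per-dimension lane ids for the dimensions along
// which the shape is distributed. Dimension i is split across
// originalShape[i] / distributedShape[i] lanes, and the product of these
// factors must equal `warpSize`. For example, with originalShape = [2, 32],
// distributedShape = [1, 2], and warpSize = 32, the factors are [2, 16] and
// lane 19 gets the ids [19 / 16, 19 % 16] = [1, 3].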
bool WarpDistributionPattern::delinearizeLaneId(
OpBuilder &builder, Location loc, ArrayRef<int64_t> originalShape,
ArrayRef<int64_t> distributedShape, int64_t warpSize, Value laneId,
SmallVectorImpl<Value> &delinearizedIds) const {
  // If the original shape and the distributed shape are the same, nothing is
  // distributed; every lane handles the whole shape. In that case we should
  // not rely on lane IDs later, so just return an empty lane ID vector.
if (originalShape == distributedShape) {
delinearizedIds.clear();
return true;
}
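  // sizes[i] is the number of lanes dimension i is distributed across; each
  // distributed extent must divide the original extent evenly.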
SmallVector<int64_t> sizes;
for (auto [large, small] : llvm::zip_equal(originalShape, distributedShape)) {
if (large % small != 0)
return false;
sizes.push_back(large / small);
}
  if (std::accumulate(sizes.begin(), sizes.end(), int64_t(1),
                      std::multiplies<int64_t>()) != warpSize)
    return false;
AffineExpr s0, s1;
bindSymbols(builder.getContext(), s0, s1);
int64_t usedThreads = 1;
Value zero = arith::ConstantIndexOp::create(builder, loc, 0);
delinearizedIds.assign(sizes.size(), zero);
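  // Peel off the lane id of one dimension at a time, innermost first: this
  // dimension's id is laneId % sizes[i], and dividing laneId by sizes[i]
  // yields the linear id for the remaining outer dimensions.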
for (int i = sizes.size() - 1; i >= 0; --i) {
usedThreads *= sizes[i];
if (usedThreads == warpSize) {
      // All lanes are consumed at this dimension; no modulo is needed, and
      // any remaining (outer) dimensions must have size 1, so they keep the
      // zero id and the calculation can stop here.
delinearizedIds[i] = laneId;
break;
}
    delinearizedIds[i] =
        affine::makeComposedAffineApply(builder, loc, s0 % sizes[i], {laneId});
    // `laneId` is rewritten every iteration, so only this dimension's
    // contribution needs to be stripped off for the next (outer) dimension.
    laneId = affine::makeComposedAffineApply(builder, loc,
                                             s0.floorDiv(sizes[i]), {laneId});
}
return true;
}