//===-- CUFPredefinedVarToGPU.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
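//
// Rewrite accesses to the CUDA Fortran predefined variables (threadidx,
// blockidx, blockdim, griddim) in device, global, grid_global and host-device
// routines into the corresponding NVVM operations. Thread and block indices
// are adjusted by one to match Fortran's 1-based convention.
//
//===----------------------------------------------------------------------===//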

#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Pass/Pass.h"

namespace fir {
#define GEN_PASS_DEF_CUFPREDEFINEDVARTOGPU
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

using namespace mlir;

namespace {

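/// Create the x, y and z variants of the given NVVM query operations and
/// append their results to \p values. When \p incrementByOne is set, add 1 to
/// each result to turn the 0-based NVVM indices into 1-based Fortran values.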
template <typename OpTyX, typename OpTyY, typename OpTyZ>
static void createForAllDimensions(mlir::OpBuilder &builder, mlir::Location loc,
                                   mlir::Value c1,
                                   SmallVectorImpl<mlir::Value> &values,
                                   bool incrementByOne = false) {
  if (incrementByOne) {
    auto baseX = OpTyX::create(builder, loc, builder.getI32Type());
    values.push_back(mlir::arith::AddIOp::create(builder, loc, baseX, c1));
    auto baseY = OpTyY::create(builder, loc, builder.getI32Type());
    values.push_back(mlir::arith::AddIOp::create(builder, loc, baseY, c1));
    auto baseZ = OpTyZ::create(builder, loc, builder.getI32Type());
    values.push_back(mlir::arith::AddIOp::create(builder, loc, baseZ, c1));
  } else {
    values.push_back(OpTyX::create(builder, loc, builder.getI32Type()));
    values.push_back(OpTyY::create(builder, loc, builder.getI32Type()));
    values.push_back(OpTyZ::create(builder, loc, builder.getI32Type()));
  }
}

static constexpr llvm::StringRef builtinsModuleName = "__fortran_builtins";
static constexpr llvm::StringRef builtinVarPrefix = "__builtin_";
static constexpr llvm::StringRef threadidx = "threadidx";
static constexpr llvm::StringRef blockidx = "blockidx";
static constexpr llvm::StringRef blockdim = "blockdim";
static constexpr llvm::StringRef griddim = "griddim";

static constexpr unsigned field_x = 0;
static constexpr unsigned field_y = 1;
static constexpr unsigned field_z = 2;

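/// Return the mangled name of a predefined variable from the
/// __fortran_builtins module, e.g.
/// "_QM__fortran_builtinsE__builtin_threadidx".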
static std::string mangleBuiltin(llvm::StringRef varName) {
  return "_QM" + builtinsModuleName.str() + "E" + builtinVarPrefix.str() +
         varName.str();
}

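/// If \p coordOp addresses component \p fieldIdx (x, y or z) of a predefined
/// variable, replace every load of its result with \p gpuValue and erase the
/// loads.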
static void processCoordinateOp(mlir::OpBuilder &builder, mlir::Location loc,
                                fir::CoordinateOp coordOp, unsigned fieldIdx,
                                mlir::Value &gpuValue) {
  std::optional<llvm::ArrayRef<int32_t>> fieldIndices =
      coordOp.getFieldIndices();
  assert(fieldIndices && fieldIndices->size() == 1 &&
         "expect only one coordinate");
  if (static_cast<unsigned>((*fieldIndices)[0]) == fieldIdx) {
    llvm::SmallVector<fir::LoadOp> opToErase;
    for (mlir::OpOperand &coordUse : coordOp.getResult().getUses()) {
      assert(mlir::isa<fir::LoadOp>(coordUse.getOwner()) &&
             "only expect load op");
      auto loadOp = mlir::dyn_cast<fir::LoadOp>(coordUse.getOwner());
      loadOp.getResult().replaceAllUsesWith(gpuValue);
      opToErase.push_back(loadOp);
    }
    for (auto op : opToErase)
      op.erase();
  }
}

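/// If \p declareOp declares the predefined variable named \p builtinVar,
/// rewrite all of its component accesses to the precomputed NVVM values in
/// \p gpuValues and record the dead declare/coordinate ops in \p opsToDelete.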
static void
processDeclareOp(mlir::OpBuilder &builder, mlir::Location loc,
                 fir::DeclareOp declareOp, llvm::StringRef builtinVar,
                 llvm::SmallVectorImpl<mlir::Value> &gpuValues,
                 llvm::SmallVectorImpl<mlir::Operation *> &opsToDelete) {
  if (declareOp.getUniqName() == builtinVar) {
    for (mlir::OpOperand &use : declareOp.getResult().getUses()) {
      fir::CoordinateOp coordOp =
          mlir::dyn_cast<fir::CoordinateOp>(use.getOwner());
      assert(coordOp && "expect only coordinate op users");
      processCoordinateOp(builder, loc, coordOp, field_x, gpuValues[0]);
      processCoordinateOp(builder, loc, coordOp, field_y, gpuValues[1]);
      processCoordinateOp(builder, loc, coordOp, field_z, gpuValues[2]);
      opsToDelete.push_back(coordOp);
    }
    opsToDelete.push_back(declareOp.getOperation());
    if (declareOp.getMemref().getDefiningOp())
      opsToDelete.push_back(declareOp.getMemref().getDefiningOp());
  }
}

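/// Pass rewriting the CUDA Fortran predefined variables used in device-side
/// routines into NVVM operations.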
struct CUFPredefinedVarToGPU
    : public fir::impl::CUFPredefinedVarToGPUBase<CUFPredefinedVarToGPU> {

  void runOnOperation() override {
    func::FuncOp funcOp = getOperation();
    if (funcOp.getBody().empty())
      return;

    if (auto cudaProcAttr =
            funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
                cuf::getProcAttrName())) {
      if (cudaProcAttr.getValue() == cuf::ProcAttribute::Device ||
          cudaProcAttr.getValue() == cuf::ProcAttribute::Global ||
          cudaProcAttr.getValue() == cuf::ProcAttribute::GridGlobal ||
          cudaProcAttr.getValue() == cuf::ProcAttribute::HostDevice) {
        mlir::Location loc = funcOp.getLoc();
        mlir::OpBuilder builder(funcOp.getContext());
        builder.setInsertionPointToStart(&funcOp.getBody().front());
        auto c1 = mlir::arith::ConstantOp::create(
            builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1));
        llvm::SmallVector<mlir::Value, 3> threadids, blockids, blockdims,
            griddims;
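        // NVVM thread and block indices are 0-based while the CUDA Fortran
        // predefined variables are 1-based, hence the +1 adjustment; grid and
        // block dimensions need no adjustment.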
        createForAllDimensions<mlir::NVVM::ThreadIdXOp, mlir::NVVM::ThreadIdYOp,
                               mlir::NVVM::ThreadIdZOp>(
            builder, loc, c1, threadids, /*incrementByOne=*/true);
        createForAllDimensions<mlir::NVVM::BlockIdXOp, mlir::NVVM::BlockIdYOp,
                               mlir::NVVM::BlockIdZOp>(
            builder, loc, c1, blockids, /*incrementByOne=*/true);
        createForAllDimensions<mlir::NVVM::GridDimXOp, mlir::NVVM::GridDimYOp,
                               mlir::NVVM::GridDimZOp>(builder, loc, c1,
                                                       griddims);
        createForAllDimensions<mlir::NVVM::BlockDimXOp, mlir::NVVM::BlockDimYOp,
                               mlir::NVVM::BlockDimZOp>(builder, loc, c1,
                                                        blockdims);

        llvm::SmallVector<mlir::Operation *> opsToDelete;
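        // Replace the component accesses of the predefined variables with the
        // NVVM values computed above, then clean up the dead ops.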
        for (auto declareOp : funcOp.getOps<fir::DeclareOp>()) {
          processDeclareOp(builder, loc, declareOp, mangleBuiltin(threadidx),
                           threadids, opsToDelete);
          processDeclareOp(builder, loc, declareOp, mangleBuiltin(blockidx),
                           blockids, opsToDelete);
          processDeclareOp(builder, loc, declareOp, mangleBuiltin(blockdim),
                           blockdims, opsToDelete);
          processDeclareOp(builder, loc, declareOp, mangleBuiltin(griddim),
                           griddims, opsToDelete);
        }

        for (auto op : opsToDelete)
          op->erase();
      }
    }
  }
};

} // end anonymous namespace