 flang/lib/Optimizer/Transforms/CUFOpConversion.cpp | 70 ++++++++++++++++++++-
 flang/test/Fir/CUDA/cuda-launch.fir                | 64 +++++++++++++++++++
 2 files changed, 132 insertions(+), 2 deletions(-)
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 14cc1cb5..fe125db 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -15,6 +15,7 @@
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Optimizer/Transforms/CUFCommon.h"
 #include "flang/Runtime/CUDA/allocatable.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
@@ -620,6 +621,69 @@ private:
   const mlir::SymbolTable &symtab;
 };
 
+struct CUFLaunchOpConversion
+    : public mlir::OpRewritePattern<cuf::KernelLaunchOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  CUFLaunchOpConversion(mlir::MLIRContext *context,
+                        const mlir::SymbolTable &symTab)
+      : OpRewritePattern(context), symTab{symTab} {}
+
+  mlir::LogicalResult
+  matchAndRewrite(cuf::KernelLaunchOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    mlir::Location loc = op.getLoc();
+    auto idxTy = mlir::IndexType::get(op.getContext());
+    auto zero = rewriter.create<mlir::arith::ConstantOp>(
+        loc, rewriter.getIntegerType(32), rewriter.getI32IntegerAttr(0));
+    auto gridSizeX =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridX());
+    auto gridSizeY =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridY());
+    auto gridSizeZ =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridZ());
+    auto blockSizeX =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockX());
+    auto blockSizeY =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockY());
+    auto blockSizeZ =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockZ());
+    auto kernelName = mlir::SymbolRefAttr::get(
+        rewriter.getStringAttr(cudaDeviceModuleName),
+        {mlir::SymbolRefAttr::get(
+            rewriter.getContext(),
+            op.getCallee().getLeafReference().getValue())});
+    mlir::Value clusterDimX, clusterDimY, clusterDimZ;
+    if (auto funcOp = symTab.lookup<mlir::func::FuncOp>(
+            op.getCallee().getLeafReference())) {
+      if (auto clusterDimsAttr = funcOp->getAttrOfType<cuf::ClusterDimsAttr>(
+              cuf::getClusterDimsAttrName())) {
+        clusterDimX = rewriter.create<mlir::arith::ConstantIndexOp>(
+            loc, clusterDimsAttr.getX().getInt());
+        clusterDimY = rewriter.create<mlir::arith::ConstantIndexOp>(
+            loc, clusterDimsAttr.getY().getInt());
+        clusterDimZ = rewriter.create<mlir::arith::ConstantIndexOp>(
+            loc, clusterDimsAttr.getZ().getInt());
+      }
+    }
+    auto gpuLaunchOp = rewriter.create<mlir::gpu::LaunchFuncOp>(
+        loc, kernelName,
+        mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ},
+        mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero,
+        op.getArgs());
+    if (clusterDimX && clusterDimY && clusterDimZ) {
+      gpuLaunchOp.getClusterSizeXMutable().assign(clusterDimX);
+      gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
+      gpuLaunchOp.getClusterSizeZMutable().assign(clusterDimZ);
+    }
+    rewriter.replaceOp(op, gpuLaunchOp);
+    return mlir::success();
+  }
+
+private:
+  const mlir::SymbolTable &symTab;
+};
+
 class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
 public:
   void runOnOperation() override {
@@ -637,7 +701,8 @@ public:
     fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
     fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
                                          /*forceUnifiedTBAATree=*/false, *dl);
-    target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect>();
+    target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect,
+                           mlir::gpu::GPUDialect>();
     cuf::populateCUFToFIRConversionPatterns(typeConverter, *dl, symtab,
                                             patterns);
     if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
@@ -656,5 +721,6 @@ void cuf::populateCUFToFIRConversionPatterns(
   patterns.insert<CufAllocOpConversion>(patterns.getContext(), &dl,
                                         &converter);
   patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion,
                   CufFreeOpConversion>(patterns.getContext());
-  patterns.insert<CufDataTransferOpConversion>(patterns.getContext(), symtab);
+  patterns.insert<CufDataTransferOpConversion, CUFLaunchOpConversion>(
+      patterns.getContext(), symtab);
 }
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
new file mode 100644
index 0000000..f11bcbd
--- /dev/null
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -0,0 +1,64 @@
+// RUN: fir-opt --split-input-file --cuf-convert %s | FileCheck %s
+
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+  gpu.module @cuda_device_mod {
+    gpu.func @_QPsub_device1() kernel {
+      cf.br ^bb1
+    ^bb1:  // pred: ^bb0
+      gpu.return
+    }
+    gpu.func @_QPsub_device2(%arg0: !fir.ref<f32>) kernel {
+      cf.br ^bb1(%arg0 : !fir.ref<f32>)
+    ^bb1(%0: !fir.ref<f32>):  // pred: ^bb0
+      %1 = fir.declare %0 {uniq_name = "_QFsub1Ei"} : (!fir.ref<f32>) -> !fir.ref<f32>
+      %cst = arith.constant 2.000000e+00 : f32
+      fir.store %cst to %1 : !fir.ref<f32>
+      gpu.return
+    }
+  }
+
+  func.func @_QQmain() attributes {fir.bindc_name = "main"} {
+    %0 = fir.alloca f32
+    // CHECK: %[[ALLOCA:.*]] = fir.alloca f32
+    %c1 = arith.constant 1 : index
+    %c11_i32 = arith.constant 11 : i32
+    %c6_i32 = arith.constant 6 : i32
+    %c1_i32 = arith.constant 1 : i32
+    // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}}
+    cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()
+
+    // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref<f32>)
+    cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref<f32>)
+    return
+  }
+
+}
+
+// -----
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+  gpu.module @cuda_device_mod {
+    gpu.func @_QMmod1Psub1(%arg0: !fir.ref<!fir.array<10xi32>>) kernel {
+      gpu.return
+    }
+  }
+
+  func.func @_QMmod1Psub1(%arg0: !fir.ref<!fir.array<10xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "adev"}) attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>, cuf.proc_attr = #cuf.cuda_proc<global>} {
+    return
+  }
+  func.func @_QMmod1Phost_sub() {
+    %c10 = arith.constant 10 : index
+    %0 = cuf.alloc !fir.array<10xi32> {bindc_name = "adev", data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEadev"} -> !fir.ref<!fir.array<10xi32>>
+    %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+    %2:2 = hlfir.declare %0(%1) {data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    cuf.kernel_launch @_QMmod1Psub1<<<%c1_i32, %c1_i32, %c1_i32, %c10_i32, %c1_i32, %c1_i32>>>(%2#1) : (!fir.ref<!fir.array<10xi32>>)
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @_QMmod1Phost_sub()
+// CHECK: gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}})
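
For reference, here is a hand-worked sketch of what the new CUFLaunchOpConversion pattern produces for the second launch in the first test above. The SSA names (%c0_i32, %grid_x, %block_x, ...) are illustrative, not the exact names the rewriter assigns; the ops themselves follow the pattern code: an i32 zero for the dynamic shared memory size, index casts of the six <<<...>>> operands, and a gpu.launch_func whose callee is re-rooted under @cuda_device_mod.

    // Input:
    cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref<f32>)

    // Output (sketch; value names are illustrative):
    %c0_i32 = arith.constant 0 : i32
    %grid_x = arith.index_cast %c1_i32 : i32 to index   // likewise %grid_y, %grid_z
    %block_x = arith.index_cast %c1_i32 : i32 to index  // likewise %block_y, %block_z
    gpu.launch_func @cuda_device_mod::@_QPsub_device2
        blocks in (%grid_x, %grid_y, %grid_z)
        threads in (%block_x, %block_y, %block_z)
        dynamic_shared_memory_size %c0_i32
        args(%0 : !fir.ref<f32>)

When the callee's host func.func carries a cuf.cluster_dims attribute, as in the second test case, the pattern additionally materializes arith.constant index values from the attribute and assigns them to the launch's cluster size operands, which prints as the `clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}})` form matched by the final CHECK line.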
