author     Fabian Mora <fmora.dev@gmail.com>  2023-09-09 11:42:01 +0000
committer  Fabian Mora <fmora.dev@gmail.com>  2023-09-09 12:45:21 +0000
commit     119c489cc1a026feafe67c2b58c34b40dc2b6bd0 (patch)
tree       4d41d329b913d46a6b4ddeb50266a9ac7c04d586
parent     2374ae4362848d92963fac042e8a3e94b9e57e0b (diff)
Reland [mlir][test][gpu] Migrate CUDA tests to the TargetAttr compilation workflow (llvm#65768)
The original commit was reverted after a buildbot failure with 'CUDA_ERROR_UNSUPPORTED_PTX_VERSION'.
The root cause was the pipeline compiling with "+ptx76" while the bot ran an older CUDA driver
that does not support that PTX version. This commit relands the patch with the default lowered to "+ptx60".
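The change is visible in `TestLowerToNVVM.cpp` below, where the cubin defaults drop from
sm_80/+ptx76 to sm_50/+ptx60 (the `...` here elides the unchanged option arguments):

  PassOptions::Option<std::string> cubinChip{..., llvm::cl::init("sm_50")};
  PassOptions::Option<std::string> cubinFeatures{..., llvm::cl::init("+ptx60")};

so the pipeline now emits PTX that older drivers can load.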
Original GitHub PR: #65768
Original commit message:
Migrate tests referencing `gpu-to-cubin` to the new compilation workflow
using `TargetAttrs`. The `test-lower-to-nvvm` pass pipeline was modified
to use the new compilation workflow to simplify the introduction of
future tests.
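For example, a typical all-reduce integration test that previously spelled out outlining,
NVVM conversion, cubin serialization, and host lowering by hand:

  // RUN: mlir-opt %s \
  // RUN: | mlir-opt -gpu-kernel-outlining \
  // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
  // RUN: | mlir-opt -gpu-to-llvm \
  // RUN: | mlir-cpu-runner ...

now collapses to a single pipeline invocation, with per-test options such as
`cubin-chip=sm_70` passed inline:

  // RUN: mlir-opt %s \
  // RUN: | mlir-opt -test-lower-to-nvvm \
  // RUN: | mlir-cpu-runner ...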
The `createLowerGpuOpsToNVVMOpsPass` function was removed, as it didn't
allow passing all of the options available on the `ConvertGpuOpsToNVVMOps`
pass.
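Callers migrate by populating the tablegen-generated options struct instead, as the
updated pipelines in this patch do. A minimal sketch; the option values below are
illustrative, not taken from this patch:

  // Before: only the two constructor parameters were configurable.
  // pm.addNestedPass<gpu::GPUModuleOp>(
  //     createLowerGpuOpsToNVVMOpsPass(/*indexBitwidth=*/32));

  // After: any option declared on ConvertGpuOpsToNVVMOps can be set.
  ConvertGpuOpsToNVVMOpsOptions opts;
  opts.indexBitwidth = 32;         // illustrative value
  opts.useBarePtrCallConv = true;  // not reachable through the old constructor
  pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps(opts));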
25 files changed, 63 insertions, 133 deletions
diff --git a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
index 46f29c6..e0f4c71 100644
--- a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
+++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
@@ -16,9 +16,7 @@ namespace mlir {
 class LLVMTypeConverter;
 class ConversionTarget;
 class RewritePatternSet;
-
-template <typename OpT>
-class OperationPass;
+class Pass;
 
 namespace gpu {
 class GPUModuleOp;
@@ -45,14 +43,6 @@ void populateGpuSubgroupReduceOpLoweringPattern(LLVMTypeConverter &converter,
 /// Collect a set of patterns to convert WMMA ops from GPU dialect to NVVM.
 void populateGpuWMMAToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                              RewritePatternSet &patterns);
-
-/// Creates a pass that lowers GPU dialect operations to NVVM counterparts. The
-/// index bitwidth used for the lowering of the device side index computations
-/// is configurable.
-std::unique_ptr<OperationPass<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass(
-    unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
-    bool hasRedux = false);
-
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_GPUTONVVM_GPUTONVVMPASS_H_
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index ed37abf..3218760 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -486,7 +486,6 @@ def LowerHostCodeToLLVMPass : Pass<"lower-host-to-llvm", "ModuleOp"> {
 
 def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> {
   let summary = "Generate NVVM operations for gpu operations";
-  let constructor = "mlir::createLowerGpuOpsToNVVMOpsPass()";
   let dependentDialects = [
     "cf::ControlFlowDialect",
     "memref::MemRefDialect",
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 06469dc..764b6a7 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -210,11 +210,7 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
 /// code.
 struct LowerGpuOpsToNVVMOpsPass
     : public impl::ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
-  LowerGpuOpsToNVVMOpsPass() = default;
-  LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux = false) {
-    this->indexBitwidth = indexBitwidth;
-    this->hasRedux = hasRedux;
-  }
+  using Base::Base;
 
   void runOnOperation() override {
     gpu::GPUModuleOp m = getOperation();
@@ -378,8 +374,3 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                     "__nv_tanh");
   populateOpPatterns<math::TanOp>(converter, patterns, "__nv_tanf",
                                   "__nv_tan");
 }
-
-std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux) {
-  return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth, hasRedux);
-}
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index a7fd5a2..24c4c4c4 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -52,7 +52,7 @@ void mlir::sparse_tensor::buildSparseCompiler(
     pm.addPass(createSparseGPUCodegenPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-    pm.addNestedPass<gpu::GPUModuleOp>(createLowerGpuOpsToNVVMOpsPass());
+    pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
   }
 
   // TODO(springerm): Add sparse support to the BufferDeallocation pass and add
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
index 2c1ae3e..0cb06b7 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
@@ -1,6 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{dump-ptx}))' \
+// RUN: | mlir-opt -test-lower-to-nvvm -debug-only=serialize-to-isa \
 // RUN: 2>&1 | FileCheck %s
 
 // CHECK: Generated by LLVM NVPTX Back-End
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
index 8eb90fd..80972f2 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -2,10 +2,9 @@
 // NOTE: this test requires gpu-sm80
 //
 // RUN: mlir-opt \
-// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse,gpu.module(gpu-to-cubin{chip=sm_80 features=+ptx71}))" \
+// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
 // RUN: %s \
-// RUN: | mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
-// RUN:   --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
+// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
index 8571c5ca..8c99149 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
@@ -1,9 +1,7 @@
 // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize |\
 // RUN: mlir-opt -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if |\
 // RUN: mlir-opt -lower-affine -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm \
-// RUN:  -convert-arith-to-llvm -gpu-kernel-outlining |\
-// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
-// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
+// RUN:  -convert-arith-to-llvm -test-lower-to-nvvm | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN: -shared-libs=%mlir_cuda_runtime \
 // RUN: -shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
index c671c18..f26c18c 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
@@ -2,9 +2,7 @@
 // everything on the same thread.
 // RUN: mlir-opt %s -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN:  -gpu-kernel-outlining |\
-// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
-// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
+// RUN:  -test-lower-to-nvvm | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN: -shared-libs=%mlir_cuda_runtime \
 // RUN: -shared-libs=%mlir_c_runner_utils \
@@ -15,9 +13,7 @@
 // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" \
 // RUN:  -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN:  -gpu-kernel-outlining |\
-// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
-// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
+// RUN:  -test-lower-to-nvvm | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN: -shared-libs=%mlir_cuda_runtime \
 // RUN: -shared-libs=%mlir_c_runner_utils \
@@ -27,9 +23,7 @@
 // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" \
 // RUN:  -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN:  -gpu-kernel-outlining |\
-// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
-// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
+// RUN:  -test-lower-to-nvvm | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN: -shared-libs=%mlir_cuda_runtime \
 // RUN: -shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
index 535ba52..591bf1b 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
-// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
index c4ca465..51bd23f 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
@@ -3,9 +3,7 @@
 // Similar to the wmma-matmul-f32 but but with the memref bare pointer lowering convention.
 // This test also uses gpu.memcpy operations (instead of gpu.host_register).
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin{chip=sm_70}))' \
-// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm="use-bare-pointers-for-host=1 use-bare-pointers-for-kernels=1" \
+// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --entry-point-result=void \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
index ae410dc..0307b3d 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
-// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
index f4324a1..b131b86 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -10,9 +8,7 @@
 
 // Same as above but with the memref bare pointer lowering convention.
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm="use-bare-pointers-for-kernels=1" \
+// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
index 0a8d38f..155423db 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
index bcd785d..e5047b6 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
index aa4b0e8..163e9fd 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
index 2e7d046..381db26 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
index 32cfa27..23c6c11 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
index 30767b9..3c5a100 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/async.mlir b/mlir/test/Integration/GPU/CUDA/async.mlir
index e6dd91a..d2a5127 100644
--- a/mlir/test/Integration/GPU/CUDA/async.mlir
+++ b/mlir/test/Integration/GPU/CUDA/async.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \
+// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary \
 // RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
 // RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \
 // RUN: | mlir-cpu-runner \
diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
index afcb674..a5d04f73 100644
--- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
+++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
@@ -1,8 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
index 444e287..7657bf4 100644
--- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
+++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir
index fce7739..1a35d1e 100644
--- a/mlir/test/Integration/GPU/CUDA/printf.mlir
+++ b/mlir/test/Integration/GPU/CUDA/printf.mlir
@@ -1,6 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
index 6a784ca..40fcea8 100644
--- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
index 5f6e5d7..5a9acdf 100644
--- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir
+++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index 5db6f56..5d0c420 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -65,11 +65,11 @@ struct TestLowerToNVVMOptions
       llvm::cl::init("nvptx64-nvidia-cuda")};
   PassOptions::Option<std::string> cubinChip{
       *this, "cubin-chip", llvm::cl::desc("Chip to use to serialize to cubin."),
-      llvm::cl::init("sm_80")};
+      llvm::cl::init("sm_50")};
   PassOptions::Option<std::string> cubinFeatures{
      *this, "cubin-features",
      llvm::cl::desc("Features to use to serialize to cubin."),
-      llvm::cl::init("+ptx76")};
+      llvm::cl::init("+ptx60")};
 };
 
 //===----------------------------------------------------------------------===//
@@ -126,13 +126,14 @@ void buildGpuPassPipeline(OpPassManager &pm,
 
   // TODO: C++20 designated initializers.
   // The following pass is inconsistent.
-  // ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
-  // convertGpuOpsToNVVMOpsOptions.indexBitwidth =
-  //     options.kernelIndexBitWidth;
+  // TODO: fix inconsistence.
+  ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
+  convertGpuOpsToNVVMOpsOptions.useBarePtrCallConv =
+      options.kernelUseBarePtrCallConv;
+  convertGpuOpsToNVVMOpsOptions.indexBitwidth = options.kernelIndexBitWidth;
+  convertGpuOpsToNVVMOpsOptions.useOpaquePointers = true;
   pm.addNestedPass<gpu::GPUModuleOp>(
-      // TODO: fix inconsistence.
-      createLowerGpuOpsToNVVMOpsPass(/*indexBitWidth=*/
-                                     options.kernelIndexBitWidth));
+      createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
 
   // TODO: C++20 designated initializers.
   ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
@@ -141,22 +142,6 @@ void buildGpuPassPipeline(OpPassManager &pm,
       createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
   pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
 
-  // TODO: C++20 designated initializers.
-  GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
-  // Note: hostBarePtrCallConv must be false for now otherwise
-  // gpu::HostRegister is ill-defined: it wants unranked memrefs but can't
-  // lower the to bare ptr.
-  gpuToLLVMConversionOptions.hostBarePtrCallConv =
-      options.hostUseBarePtrCallConv;
-  gpuToLLVMConversionOptions.kernelBarePtrCallConv =
-      options.kernelUseBarePtrCallConv;
-  gpuToLLVMConversionOptions.useOpaquePointers = true;
-
-  // TODO: something useful here.
-  // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
-
   // Convert vector to LLVM (always needed).
   // TODO: C++20 designated initializers.
   ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
@@ -170,11 +155,6 @@ void buildGpuPassPipeline(OpPassManager &pm,
 
   // Finally we can reconcile unrealized casts.
   pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
-
-#if MLIR_GPU_TO_CUBIN_PASS_ENABLE
-  pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
-      options.cubinTriple, options.cubinChip, options.cubinFeatures));
-#endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
 }
 
 void buildLowerToNVVMPassPipeline(OpPassManager &pm,
@@ -251,22 +231,16 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
   //===--------------------------------------------------------------------===//
   // Host post-GPUModule-specific stuff.
   //===--------------------------------------------------------------------===//
-  // Convert vector to LLVM (always needed).
+  // Attach an NVVM target to all the GPU modules with the provided target
+  // options.
   // TODO: C++20 designated initializers.
-  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
-  convertVectorToLLVMPassOptions.reassociateFPReductions = true;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
+  GpuNVVMAttachTargetOptions nvvmTargetOptions;
+  nvvmTargetOptions.triple = options.cubinTriple;
+  nvvmTargetOptions.chip = options.cubinChip;
+  nvvmTargetOptions.features = options.cubinFeatures;
+  pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
 
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
-  pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
-
-  // This must happen after cubin translation otherwise gpu.launch_func is
-  // illegal if no cubin annotation is present.
+  // Convert GPU to LLVM.
   // TODO: C++20 designated initializers.
   GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
   // Note: hostBarePtrCallConv must be false for now otherwise
@@ -277,10 +251,28 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
   gpuToLLVMConversionOptions.kernelBarePtrCallConv =
       options.kernelUseBarePtrCallConv;
   gpuToLLVMConversionOptions.useOpaquePointers = true;
+
   // TODO: something useful here.
   // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
   pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
 
+  // Serialize all GPU modules to binaries.
+  pm.addPass(createGpuModuleToBinaryPass());
+
+  // Convert vector to LLVM (always needed).
+  // TODO: C++20 designated initializers.
+  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
+  convertVectorToLLVMPassOptions.reassociateFPReductions = true;
+  pm.addNestedPass<func::FuncOp>(
+      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
+
+  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
+  // Must be 64b on the host, things don't compose properly around
+  // gpu::LaunchOp and gpu::HostRegisterOp.
+  // TODO: fix GPU layering.
+  convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
+  pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
+
   // Convert Func to LLVM (always needed).
   // TODO: C++20 designated initializers.
   ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions2;