author     Guray Ozen <guray.ozen@gmail.com>        2023-09-22 17:09:43 +0200
committer  GitHub <noreply@github.com>              2023-09-22 17:09:43 +0200
commit     17649a7726d3ce1ddba2bbf3ef73af03ea204753
tree       5388987a2424614d30a521f925c3f10182054a75
parent     7ff83ed6cda068d99ec2926216d9868754da6e79
[MLIR][NVGPU] Introduce `nvgpu.mbarrier.group` for multiple mbarrier use (#65951)
A common practice is to create multiple `mbarrier` objects; see the
example below. This is particularly valuable in scenarios like
software pipelining for GEMM, where we need to generate multiple
barriers and then dynamically index, use, and wait on them inside a loop.
This PR generalizes the `nvgpu.mbarrier.barrier` type into
`nvgpu.mbarrier.group`. All `mbarrier`-related Ops now use this type.
Consequently, these Ops are now capable of managing multiple barriers
seamlessly.
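For reference, the type syntax changes as follows (both forms appear in the
tests updated by this patch; `num_barriers = 4` here is only illustrative):
```
// Before this patch: a single mbarrier object.
!barrier  = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>

// After this patch: a group of one or more mbarrier objects
// (num_barriers is optional and defaults to 1).
!barriers = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 4>
```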
Keeping `num_barriers` static (e.g. `num_barriers = 4`) lets us place the
mbarrier object(s) in static shared memory. We could make the value
dynamic, but that would require dynamic shared memory and complicate the
codegen.
```
%barriers = nvgpu.mbarrier.create -> !nvgpu.mbarrier.group<3, num_barriers = 4>
nvgpu.mbarrier.init %barriers[%c0], %num_threads : !nvgpu.mbarrier.group<3, num_barriers = 4>
nvgpu.mbarrier.init %barriers[%c1], %num_threads : !nvgpu.mbarrier.group<3, num_barriers = 4>
nvgpu.mbarrier.init %barriers[%c2], %num_threads : !nvgpu.mbarrier.group<3, num_barriers = 4>
nvgpu.mbarrier.init %barriers[%c3], %num_threads : !nvgpu.mbarrier.group<3, num_barriers = 4>
...
scf.for %i = %c0 to %n step %c1 {
nvgpu.mbarrier.try_wait %barriers[ (i % 4) ] ...
// ... Do work once mbarrier is ready
nvgpu.mbarrier.arrive.expect_tx %barriers[ ((i + 3) % 4) ] ...
}
```
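As context for the static-shared-memory point above, a hedged sketch of what
the group lowers to: after NVGPU-to-NVVM conversion, `nvgpu.mbarrier.create`
is backed by a single shared-memory global holding one 64-bit word per
barrier, so a static `num_barriers` maps directly to a statically sized
buffer (the `@__mbarrier` symbol name follows the convention visible in the
updated tests; the exact name is generated by the lowering):
```
// The group of 4 mbarriers above becomes one workgroup-memory buffer
// (address space 3) with one i64 slot per barrier; each op then indexes
// into it with the given barrier id.
%barriers_memref = memref.get_global @__mbarrier : memref<4xi64, 3>
```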
The resulting mbarrier usage pattern looks like this:
```
expect_tx[0]
expect_tx[1]
expect_tx[2]
Loop:
try_wait mbarrier[0], expect_tx[3]
try_wait mbarrier[1], expect_tx[0]
try_wait mbarrier[2], expect_tx[1]
try_wait mbarrier[3], expect_tx[2]
...
```
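A minimal MLIR sketch of the rotating-index wait, adapted from the
`mbarrier_wait` test added by this patch (the loop bound and barrier count
are illustrative):
```
!barriers = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 4>

func.func @wait_in_loop(%barriers : !barriers, %token : !nvgpu.mbarrier.token) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %n  = arith.constant 100 : index
  %numBarriers = arith.constant 4 : index
  scf.for %i = %c0 to %n step %c1 {
    // Pick the mbarrier for this iteration: i % num_barriers.
    %mbarId = arith.remui %i, %numBarriers : index
    %isDone = nvgpu.mbarrier.test.wait %barriers[%mbarId], %token
      : !barriers, !nvgpu.mbarrier.token
  }
  return
}
```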
-rw-r--r--  mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h     |   6
-rw-r--r--  mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td                |  62
-rw-r--r--  mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp            | 135
-rw-r--r--  mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp  |  41
-rw-r--r--  mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir        | 102
-rw-r--r--  mlir/test/Dialect/NVGPU/tmaload-transform.mlir             |  12
6 files changed, 208 insertions, 150 deletions
diff --git a/mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h b/mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h index 8c5667c..4b8d5c5 100644 --- a/mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h +++ b/mlir/include/mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h @@ -23,15 +23,15 @@ class Pass; #include "mlir/Conversion/Passes.h.inc" namespace nvgpu { -class MBarrierType; +class MBarrierGroupType; /// Returns the memory space attribute of the mbarrier object. Attribute getMbarrierMemorySpace(MLIRContext *context, - MBarrierType barrierType); + MBarrierGroupType barrierType); /// Return the memref type that can be used to represent an mbarrier object. MemRefType getMBarrierMemrefType(MLIRContext *context, - MBarrierType barrierType); + MBarrierGroupType barrierType); } // namespace nvgpu void populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter, diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td index 9038164..5fcf08c 100644 --- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td +++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td @@ -135,20 +135,26 @@ def NVGPU_DeviceAsyncToken : NVGPU_Type<"DeviceAsyncToken", }]; } -def NVGPU_MBarrier : NVGPU_Type<"MBarrier", "mbarrier.barrier", []> { +def NVGPU_MBarrierGroup : NVGPU_Type<"MBarrierGroup", "mbarrier.group", []> { let summary = "mbarrier barrier type"; let description = [{ - This is the type for a mbarrier object in shared memory that is used - to synchronize a variable number of threads. + This is the type for one or more mbarrier object in shared memory that is + used to synchronize a variable number of threads. - The mbarrier object is 64 bit with 8 byte alignment. The mbarrier object - can be initiated and invalidated. + If `num_barriers` is not set, the number of mbarrier objects is 1. - See for more details: - https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object + A mbarrier object is 64 bit with 8 byte alignment. The mbarrier object + can be initiated and invalidated. + + [See for more details in PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#size-and-alignment-of-mbarrier-object) }]; - let parameters = (ins "Attribute":$memorySpace); + let parameters = (ins "Attribute":$memorySpace, DefaultValuedParameter<"unsigned", "1">:$num_barriers); let assemblyFormat = "`<` struct(params) `>`"; + let builders = [ + TypeBuilder<(ins "Attribute":$memorySpace), [{ + return $_get($_ctxt, memorySpace, 1); + }]> + ]; } def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { } @@ -486,7 +492,7 @@ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> { def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> { let summary = "Creates a `nvgpu.mbarrier` object."; let description = [{ - The Op generates an `mbarrier` object, which is a barrier created in + The Op generates one or more `mbarrier` object, which is a barrier created in shared memory and supports various synchronization behaviors for threads. 
The `mbarrier` object has the following type and alignment requirements: @@ -498,9 +504,9 @@ def NVGPU_MBarrierCreateOp : NVGPU_Op<"mbarrier.create", []> { ``` }]; let arguments = (ins); - let results = (outs NVGPU_MBarrier:$barrier); + let results = (outs NVGPU_MBarrierGroup:$barriers); let assemblyFormat = [{ - attr-dict `->` type($barrier) + attr-dict `->` type($barriers) }]; } @@ -516,8 +522,8 @@ def NVGPU_MBarrierInitOp : NVGPU_Op<"mbarrier.init", []> { nvgpu.mbarrier.init %barrier, %num_threads : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> ``` }]; - let arguments = (ins NVGPU_MBarrier:$barrier, Index:$count); - let assemblyFormat = "$barrier `,` $count attr-dict `:` type($barrier)"; + let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$count, Index:$mbarId); + let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers)"; } def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> { @@ -531,9 +537,9 @@ def NVGPU_MBarrierTestWaitOp : NVGPU_Op<"mbarrier.test.wait", []> { %isComplete = nvgpu.mbarrier.test.wait %barrier, %token : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>, !nvgpu.mbarrier.token ``` }]; - let arguments = (ins NVGPU_MBarrier:$barrier, NVGPU_MBarrierToken:$token); + let arguments = (ins NVGPU_MBarrierGroup:$barriers, NVGPU_MBarrierToken:$token, Index:$mbarId); let results = (outs I1:$waitComplete); - let assemblyFormat = "$barrier `,` $token attr-dict `:` type($barrier) `,` type($token)"; + let assemblyFormat = "$barriers `[` $mbarId `]` `,` $token attr-dict `:` type($barriers) `,` type($token)"; } def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> { @@ -550,9 +556,9 @@ def NVGPU_MBarrierArriveOp : NVGPU_Op<"mbarrier.arrive", []> { %token = nvgpu.mbarrier.arrive %barrier : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token ``` }]; - let arguments = (ins NVGPU_MBarrier:$barrier); + let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId); let results = (outs NVGPU_MBarrierToken:$token); -let assemblyFormat = "$barrier attr-dict `:` type($barrier) `->` type($token)"; +let assemblyFormat = "$barriers `[` $mbarId `]` attr-dict `:` type($barriers) `->` type($token)"; } def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", []> { @@ -568,10 +574,10 @@ def NVGPU_MBarrierArriveNoCompleteOp : NVGPU_Op<"mbarrier.arrive.nocomplete", [] %token = nvgpu.mbarrier.arrive.noComplete %barrier, %count : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> -> !nvgpu.mbarrier.token ``` }]; - let arguments = (ins NVGPU_MBarrier:$barrier, + let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$mbarId, Index:$count); let results = (outs NVGPU_MBarrierToken:$token); - let assemblyFormat = "$barrier `,` $count attr-dict `:` type($barrier) `->` type($token)"; + let assemblyFormat = "$barriers `[` $mbarId `]` `,` $count attr-dict `:` type($barriers) `->` type($token)"; } def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> { @@ -591,9 +597,8 @@ def NVGPU_MBarrierArriveExpectTxOp : NVGPU_Op<"mbarrier.arrive.expect_tx", []> { nvgpu.mbarrier.arrive.expect_tx %barrier, %ic0 : !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> ``` }]; - let arguments = (ins NVGPU_MBarrier:$barrier, - Index:$txcount); - let assemblyFormat = "$barrier `,` $txcount attr-dict `:` type($barrier)"; + let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$txcount, Index:$mbarId); + let 
assemblyFormat = "$barriers `[` $mbarId `]` `,` $txcount attr-dict `:` type($barriers)"; } def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> { @@ -610,8 +615,8 @@ def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> { ``` }]; - let arguments = (ins NVGPU_MBarrier:$barrier, Index:$phase, Index:$ticks); - let assemblyFormat = "$barrier `,` $phase `,` $ticks attr-dict `:` type($barrier)"; + let arguments = (ins NVGPU_MBarrierGroup:$barriers, Index:$phase, Index:$ticks, Index:$mbarId); + let assemblyFormat = "$barriers `[` $mbarId `]` `,` $phase `,` $ticks attr-dict `:` type($barriers)"; } def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", []> { @@ -626,12 +631,13 @@ def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", []> { The Op uses `$barrier` mbarrier based completion mechanism. }]; let arguments = (ins Arg<AnyMemRef, "", [MemWrite]>:$dst, - NVGPU_MBarrier:$barrier, + NVGPU_MBarrierGroup:$barriers, NVGPU_TensorMapDescriptor:$tensorMapDescriptor, - Variadic<Index>:$coordinates); + Variadic<Index>:$coordinates, + Index:$mbarId); let assemblyFormat = [{ - $tensorMapDescriptor `[` $coordinates `]` `,` $barrier `to` $dst - attr-dict `:` type($tensorMapDescriptor) `,` type($barrier) `->` type($dst) + $tensorMapDescriptor `[` $coordinates `]` `,` $barriers `[` $mbarId `]` `to` $dst + attr-dict `:` type($tensorMapDescriptor) `,` type($barriers) `->` type($dst) }]; let hasVerifier = 1; diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index f74aa05..4d1f664 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -18,8 +18,10 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" #include "mlir/Dialect/SCF/Transforms/Patterns.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/Value.h" #include "mlir/Pass/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -218,14 +220,14 @@ static SmallVector<Value> unpackOperandVector(RewriterBase &rewriter, } /// Returns whether mbarrier object has shared memory address space. -static bool isMbarrierShared(nvgpu::MBarrierType barrierType) { +static bool isMbarrierShared(nvgpu::MBarrierGroupType barrierType) { return (mlir::nvgpu::NVGPUDialect::isSharedMemoryAddressSpace( barrierType.getMemorySpace())); } /// Returns the memory space attribute of the mbarrier object. Attribute nvgpu::getMbarrierMemorySpace(MLIRContext *context, - nvgpu::MBarrierType barrierType) { + nvgpu::MBarrierGroupType barrierType) { Attribute memorySpace = {}; if (isMbarrierShared(barrierType)) { memorySpace = @@ -236,25 +238,13 @@ Attribute nvgpu::getMbarrierMemorySpace(MLIRContext *context, } /// Returns memref type of the mbarrier object. The type is defined in the -/// MBarrierType. +/// MBarrierGroupType. MemRefType nvgpu::getMBarrierMemrefType(MLIRContext *context, - nvgpu::MBarrierType barrierType) { + nvgpu::MBarrierGroupType barrierType) { Attribute memorySpace = nvgpu::getMbarrierMemorySpace(context, barrierType); MemRefLayoutAttrInterface layout; - return MemRefType::get({1}, IntegerType::get(context, 64), layout, - memorySpace); -} - -/// Returns the base pointer of the mbarrier object. 
-static Value getMbarrierPtr(ConversionPatternRewriter &rewriter, - const LLVMTypeConverter &typeConverter, - TypedValue<nvgpu::MBarrierType> barrier, - Value barrierMemref) { - MemRefType memrefType = - nvgpu::getMBarrierMemrefType(rewriter.getContext(), barrier.getType()); - MemRefDescriptor memRefDescriptor(barrierMemref); - return memRefDescriptor.bufferPtr(rewriter, barrier.getLoc(), typeConverter, - memrefType); + return MemRefType::get({barrierType.getNumBarriers()}, + IntegerType::get(context, 64), layout, memorySpace); } namespace { @@ -441,7 +431,7 @@ struct ConvertNVGPUToNVVMPass [&](nvgpu::WarpgroupMatrixDescriptorType type) -> Type { return converter.convertType(IntegerType::get(type.getContext(), 64)); }); - converter.addConversion([&](nvgpu::MBarrierType type) -> Type { + converter.addConversion([&](nvgpu::MBarrierGroupType type) -> Type { return converter.convertType( nvgpu::getMBarrierMemrefType(rewriter.getContext(), type)); }); @@ -779,7 +769,7 @@ struct NVGPUMBarrierCreateLowering ConversionPatternRewriter &rewriter) const override { Operation *funcOp = op->getParentOp(); MemRefType barrierType = nvgpu::getMBarrierMemrefType( - rewriter.getContext(), op.getBarrier().getType()); + rewriter.getContext(), op.getBarriers().getType()); memref::GlobalOp global; if (auto moduleOp = funcOp->getParentOfType<gpu::GPUModuleOp>()) @@ -794,21 +784,37 @@ struct NVGPUMBarrierCreateLowering } }; +/// Base class for lowering mbarrier operations to nvvm intrinsics. +template <typename SourceOp> +struct MBarrierBasePattern : public ConvertOpToLLVMPattern<SourceOp> { +public: + using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern; + /// Returns the base pointer of the mbarrier object. + Value getMbarrierPtr(Operation *op, nvgpu::MBarrierGroupType mbarType, + Value memrefDesc, Value mbarId, + ConversionPatternRewriter &rewriter) const { + MemRefType mbarrierMemrefType = + nvgpu::getMBarrierMemrefType(rewriter.getContext(), mbarType); + return ConvertToLLVMPattern::getStridedElementPtr( + op->getLoc(), mbarrierMemrefType, memrefDesc, {mbarId}, rewriter); + return memrefDesc; + } +}; + /// Lowers `nvgpu.mbarrier.init` to `nvvm.mbarrier.init` struct NVGPUMBarrierInitLowering - : public ConvertOpToLLVMPattern<nvgpu::MBarrierInitOp> { - using ConvertOpToLLVMPattern<nvgpu::MBarrierInitOp>::ConvertOpToLLVMPattern; + : public MBarrierBasePattern<nvgpu::MBarrierInitOp> { + using MBarrierBasePattern<nvgpu::MBarrierInitOp>::MBarrierBasePattern; LogicalResult matchAndRewrite(nvgpu::MBarrierInitOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { + nvgpu::MBarrierGroupType mbarrierType = op.getBarriers().getType(); rewriter.setInsertionPoint(op); - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); - + Value barrier = getMbarrierPtr(op, mbarrierType, adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount()); - - if (isMbarrierShared(op.getBarrier().getType())) { + if (isMbarrierShared(mbarrierType)) { rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>(op, barrier, count); } else { @@ -820,16 +826,17 @@ struct NVGPUMBarrierInitLowering /// Lowers `nvgpu.mbarrier.arrive` to `nvvm.mbarrier.arrive` struct NVGPUMBarrierArriveLowering - : public ConvertOpToLLVMPattern<nvgpu::MBarrierArriveOp> { - using ConvertOpToLLVMPattern<nvgpu::MBarrierArriveOp>::ConvertOpToLLVMPattern; + : public MBarrierBasePattern<nvgpu::MBarrierArriveOp> { + using 
MBarrierBasePattern<nvgpu::MBarrierArriveOp>::MBarrierBasePattern; LogicalResult matchAndRewrite(nvgpu::MBarrierArriveOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); + Value barrier = + getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); Type tokenType = getTypeConverter()->convertType( nvgpu::MBarrierTokenType::get(op->getContext())); - if (isMbarrierShared(op.getBarrier().getType())) { + if (isMbarrierShared(op.getBarriers().getType())) { rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveSharedOp>(op, tokenType, barrier); } else { @@ -843,19 +850,19 @@ struct NVGPUMBarrierArriveLowering /// Lowers `nvgpu.mbarrier.arrive.nocomplete` to /// `nvvm.mbarrier.arrive.nocomplete` struct NVGPUMBarrierArriveNoCompleteLowering - : public ConvertOpToLLVMPattern<nvgpu::MBarrierArriveNoCompleteOp> { - using ConvertOpToLLVMPattern< - nvgpu::MBarrierArriveNoCompleteOp>::ConvertOpToLLVMPattern; - + : public MBarrierBasePattern<nvgpu::MBarrierArriveNoCompleteOp> { + using MBarrierBasePattern< + nvgpu::MBarrierArriveNoCompleteOp>::MBarrierBasePattern; LogicalResult matchAndRewrite(nvgpu::MBarrierArriveNoCompleteOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); + Value barrier = + getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); Type tokenType = getTypeConverter()->convertType( nvgpu::MBarrierTokenType::get(op->getContext())); Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount()); - if (isMbarrierShared(op.getBarrier().getType())) { + if (isMbarrierShared(op.getBarriers().getType())) { rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>( op, tokenType, barrier, count); } else { @@ -868,17 +875,16 @@ struct NVGPUMBarrierArriveNoCompleteLowering /// Lowers `nvgpu.mbarrier.test.wait` to `nvvm.mbarrier.test.wait` struct NVGPUMBarrierTestWaitLowering - : public ConvertOpToLLVMPattern<nvgpu::MBarrierTestWaitOp> { - using ConvertOpToLLVMPattern< - nvgpu::MBarrierTestWaitOp>::ConvertOpToLLVMPattern; - + : public MBarrierBasePattern<nvgpu::MBarrierTestWaitOp> { + using MBarrierBasePattern<nvgpu::MBarrierTestWaitOp>::MBarrierBasePattern; LogicalResult matchAndRewrite(nvgpu::MBarrierTestWaitOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); + Value barrier = + getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); Type retType = rewriter.getI1Type(); - if (isMbarrierShared(op.getBarrier().getType())) { + if (isMbarrierShared(op.getBarriers().getType())) { rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitSharedOp>( op, retType, barrier, adaptor.getToken()); } else { @@ -890,18 +896,18 @@ struct NVGPUMBarrierTestWaitLowering }; struct NVGPUMBarrierArriveExpectTxLowering - : public ConvertOpToLLVMPattern<nvgpu::MBarrierArriveExpectTxOp> { - using ConvertOpToLLVMPattern< - nvgpu::MBarrierArriveExpectTxOp>::ConvertOpToLLVMPattern; - + : public MBarrierBasePattern<nvgpu::MBarrierArriveExpectTxOp> { + using MBarrierBasePattern< + nvgpu::MBarrierArriveExpectTxOp>::MBarrierBasePattern; LogicalResult 
matchAndRewrite(nvgpu::MBarrierArriveExpectTxOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); + Value barrier = + getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); Value txcount = truncToI32(rewriter, op->getLoc(), adaptor.getTxcount()); - if (isMbarrierShared(op.getBarrier().getType())) { + if (isMbarrierShared(op.getBarriers().getType())) { rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveExpectTxSharedOp>( op, barrier, txcount); return success(); @@ -914,19 +920,19 @@ struct NVGPUMBarrierArriveExpectTxLowering }; struct NVGPUMBarrierTryWaitParityLowering - : public ConvertOpToLLVMPattern<nvgpu::MBarrierTryWaitParityOp> { - using ConvertOpToLLVMPattern< - nvgpu::MBarrierTryWaitParityOp>::ConvertOpToLLVMPattern; - + : public MBarrierBasePattern<nvgpu::MBarrierTryWaitParityOp> { + using MBarrierBasePattern< + nvgpu::MBarrierTryWaitParityOp>::MBarrierBasePattern; LogicalResult matchAndRewrite(nvgpu::MBarrierTryWaitParityOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); + Value barrier = + getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); Value ticks = truncToI32(rewriter, op->getLoc(), adaptor.getTicks()); Value phase = truncToI32(rewriter, op->getLoc(), adaptor.getPhase()); - if (isMbarrierShared(op.getBarrier().getType())) { + if (isMbarrierShared(op.getBarriers().getType())) { rewriter.replaceOpWithNewOp<NVVM::MBarrierTryWaitParitySharedOp>( op, barrier, phase, ticks); return success(); @@ -939,16 +945,17 @@ struct NVGPUMBarrierTryWaitParityLowering }; struct NVGPUTmaAsyncLoadOpLowering - : public ConvertOpToLLVMPattern<nvgpu::TmaAsyncLoadOp> { - using ConvertOpToLLVMPattern<nvgpu::TmaAsyncLoadOp>::ConvertOpToLLVMPattern; + : public MBarrierBasePattern<nvgpu::TmaAsyncLoadOp> { + using MBarrierBasePattern<nvgpu::TmaAsyncLoadOp>::MBarrierBasePattern; LogicalResult matchAndRewrite(nvgpu::TmaAsyncLoadOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto srcMemrefType = cast<MemRefType>(op.getDst().getType()); Value dest = getStridedElementPtr(op->getLoc(), srcMemrefType, adaptor.getDst(), {}, rewriter); - Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(), - op.getBarrier(), adaptor.getBarrier()); + Value barrier = + getMbarrierPtr(op, op.getBarriers().getType(), adaptor.getBarriers(), + adaptor.getMbarId(), rewriter); SmallVector<Value> coords = adaptor.getCoordinates(); for (auto [index, value] : llvm::enumerate(coords)) { diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp index 680c21a..94d7d56 100644 --- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp +++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp @@ -70,7 +70,7 @@ void transform::ApplyNVGPUToNVVMConversionPatternsOp::populatePatterns( LLVM::LLVMStructType::getLiteral(type.getContext(), structBody); return llvmTypeConverter.convertType(convertedType); }); - llvmTypeConverter.addConversion([&](nvgpu::MBarrierType type) -> Type { + llvmTypeConverter.addConversion([&](nvgpu::MBarrierGroupType type) -> Type { return llvmTypeConverter.convertType( getMBarrierMemrefType(type.getContext(), type)); }); @@ -818,7 +818,7 @@ 
struct HopperBuilder { HopperBuilder(RewriterBase &rewriter, Location loc) : rewriter(rewriter), loc(loc) {} - TypedValue<nvgpu::MBarrierType> + TypedValue<nvgpu::MBarrierGroupType> buildAndInitBarrierInSharedMemory(OpFoldResult numThreads); /// Create tma descriptor op to initiate transfer from global to shared @@ -832,9 +832,9 @@ struct HopperBuilder { OpFoldResult buildTmaAsyncLoad(TypedValue<nvgpu::TensorMapDescriptorType> globalDesc, TypedValue<MemRefType> sharedMemref, - TypedValue<nvgpu::MBarrierType> barrier, + TypedValue<nvgpu::MBarrierGroupType> barrier, SmallVectorImpl<Operation *> &loadOps); - void buildBarrierArriveTx(TypedValue<nvgpu::MBarrierType> barrier, + void buildBarrierArriveTx(TypedValue<nvgpu::MBarrierGroupType> barrier, ArrayRef<OpFoldResult> sizes); /// If threadIdx.x == 0 does TMA request + wait, else just wait. @@ -843,9 +843,9 @@ struct HopperBuilder { SmallVector<Operation *> buildPredicateLoadsOnThread0( ArrayRef<TypedValue<nvgpu::TensorMapDescriptorType>> globalDescriptors, ArrayRef<TypedValue<MemRefType>> sharedMemBuffers, - TypedValue<nvgpu::MBarrierType> barrier); + TypedValue<nvgpu::MBarrierGroupType> barrier); - void buildTryWaitParity(TypedValue<nvgpu::MBarrierType> barrier); + void buildTryWaitParity(TypedValue<nvgpu::MBarrierGroupType> barrier); RewriterBase &rewriter; Location loc; @@ -854,7 +854,7 @@ struct HopperBuilder { SmallVector<Operation *> HopperBuilder::buildPredicateLoadsOnThread0( ArrayRef<TypedValue<nvgpu::TensorMapDescriptorType>> globalDescriptors, ArrayRef<TypedValue<MemRefType>> sharedMemBuffers, - TypedValue<nvgpu::MBarrierType> barrier) { + TypedValue<nvgpu::MBarrierGroupType> barrier) { SmallVector<Operation *> loadOps; Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0); Value tidx = rewriter.create<gpu::ThreadIdOp>(loc, gpu::Dimension::x); @@ -895,15 +895,18 @@ static Attribute getSharedAddressSpaceAttribute(OpBuilder &b) { // return b.getI64IntegerAttr(static_cast<int64_t>(kSharedMemorySpace)); } -TypedValue<nvgpu::MBarrierType> +TypedValue<nvgpu::MBarrierGroupType> HopperBuilder::buildAndInitBarrierInSharedMemory(OpFoldResult numThreads) { auto sharedMemorySpace = getSharedAddressSpaceAttribute(rewriter); Value barrier = rewriter.create<nvgpu::MBarrierCreateOp>( - loc, nvgpu::MBarrierType::get(rewriter.getContext(), sharedMemorySpace)); + loc, + nvgpu::MBarrierGroupType::get(rewriter.getContext(), sharedMemorySpace)); + Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0); rewriter.create<nvgpu::MBarrierInitOp>( - loc, barrier, getValueOrCreateConstantIndexOp(rewriter, loc, numThreads)); + loc, barrier, getValueOrCreateConstantIndexOp(rewriter, loc, numThreads), + zero); rewriter.create<gpu::BarrierOp>(loc); - return cast<TypedValue<nvgpu::MBarrierType>>(barrier); + return cast<TypedValue<nvgpu::MBarrierGroupType>>(barrier); } TypedValue<nvgpu::TensorMapDescriptorType> @@ -938,12 +941,12 @@ HopperBuilder::buildGlobalMemRefDescriptor(TypedValue<MemRefType> memref, OpFoldResult HopperBuilder::buildTmaAsyncLoad( TypedValue<nvgpu::TensorMapDescriptorType> globalDesc, TypedValue<MemRefType> sharedMemref, - TypedValue<nvgpu::MBarrierType> barrier, + TypedValue<nvgpu::MBarrierGroupType> barrier, SmallVectorImpl<Operation *> &loadOps) { MLIRContext *ctx = rewriter.getContext(); Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0); Operation *loadOp = rewriter.create<nvgpu::TmaAsyncLoadOp>( - loc, sharedMemref, barrier, globalDesc, ValueRange{zero, zero}); + loc, sharedMemref, barrier, globalDesc, 
ValueRange{zero, zero}, zero); loadOps.push_back(loadOp); auto mixedSizes = memref::getMixedSizes(rewriter, loc, sharedMemref); SmallVector<AffineExpr> symbols(mixedSizes.size()); @@ -957,7 +960,7 @@ OpFoldResult HopperBuilder::buildTmaAsyncLoad( } void HopperBuilder::buildBarrierArriveTx( - TypedValue<nvgpu::MBarrierType> barrier, + TypedValue<nvgpu::MBarrierGroupType> barrier, ArrayRef<OpFoldResult> mixedSizes) { assert(!mixedSizes.empty() && "expecte non-empty sizes"); MLIRContext *ctx = rewriter.getContext(); @@ -967,19 +970,21 @@ void HopperBuilder::buildBarrierArriveTx( OpFoldResult size = affine::makeComposedFoldedAffineApply(rewriter, loc, sumExpr, mixedSizes); Value sizeVal = getValueOrCreateConstantIndexOp(rewriter, loc, size); - rewriter.create<nvgpu::MBarrierArriveExpectTxOp>(loc, barrier, sizeVal); + Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0); + rewriter.create<nvgpu::MBarrierArriveExpectTxOp>(loc, barrier, sizeVal, zero); } void HopperBuilder::buildTryWaitParity( - TypedValue<nvgpu::MBarrierType> barrier) { + TypedValue<nvgpu::MBarrierGroupType> barrier) { Value parity = rewriter.create<arith::ConstantIndexOp>(loc, 0); // 10M is an arbitrary, not too small or too big number to specify the number // of ticks before retry. // TODO: hoist this in a default dialect constant. Value ticksBeforeRetry = rewriter.create<arith::ConstantIndexOp>(loc, 10000000); + Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0); rewriter.create<nvgpu::MBarrierTryWaitParityOp>(loc, barrier, parity, - ticksBeforeRetry); + ticksBeforeRetry, zero); } //===----------------------------------------------------------------------===// @@ -1013,7 +1018,7 @@ SmallVector<Operation *> CopyBuilder::rewrite(ArrayRef<Operation *> copyOps) { ArrayRef<OpFoldResult>{launchOp.getBlockSizeX(), launchOp.getBlockSizeY(), launchOp.getBlockSizeZ()}); - TypedValue<nvgpu::MBarrierType> barrier = + TypedValue<nvgpu::MBarrierGroupType> barrier = buildAndInitBarrierInSharedMemory(numThreads); SmallVector<TypedValue<MemRefType>> shmems; diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index f011007..8c2f8dbb 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -470,28 +470,34 @@ func.func @mma_sp_sync_i8_16864(%arg0: vector<4x4xi8>, return %d : vector<2x2xi32> } -!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> +!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>> !tokenType = !nvgpu.mbarrier.token // CHECK-LABEL: func @mbarrier func.func @mbarrier() { %num_threads = arith.constant 128 : index + // CHECK: %[[c0:.+]] = arith.constant 0 : index + // CHECK: %[[mid:.+]] = builtin.unrealized_conversion_cast %[[c0]] : index to i64 + %c0 = arith.constant 0 : index // CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier{{.*}} : memref<1xi64, 3> %barrier = nvgpu.mbarrier.create -> !barrierType // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> - // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> 
!llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] - nvgpu.mbarrier.init %barrier, %num_threads : !barrierType + nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType - // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.shared %[[barPtr2]] - %token = nvgpu.mbarrier.arrive %barrier : !barrierType -> !tokenType + %token = nvgpu.mbarrier.arrive %barrier[%c0] : !barrierType -> !tokenType - // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] - %isDone = nvgpu.mbarrier.test.wait %barrier, %token : !barrierType, !tokenType + %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType func.return } @@ -500,63 +506,96 @@ func.func @mbarrier() { func.func @mbarrier_nocomplete() { %num_threads = arith.constant 128 : index %count = arith.constant 12 : index + // CHECK: %[[c0:.+]] = arith.constant 0 : index + // CHECK: %[[mid:.+]] = builtin.unrealized_conversion_cast %[[c0]] : index to i64 + %c0 = arith.constant 0 : index // CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier{{.*}} : memref<1xi64, 3> %barrier = nvgpu.mbarrier.create -> !barrierType // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> - // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] - nvgpu.mbarrier.init %barrier, %num_threads : !barrierType + nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType - // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete.shared %[[barPtr2]] - %token = nvgpu.mbarrier.arrive.nocomplete %barrier, %count : !barrierType -> !tokenType + %token = nvgpu.mbarrier.arrive.nocomplete %barrier[%c0], %count : !barrierType -> !tokenType - // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : 
(!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] - %isDone = nvgpu.mbarrier.test.wait %barrier, %token : !barrierType, !tokenType + %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType func.return } +// CHECK-LABEL: func @mbarrier_wait +func.func @mbarrier_wait(%barriers : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 5>, %token : !tokenType) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %n = arith.constant 100 : index + + %numBarriers = arith.constant 5 : index + + scf.for %i = %c0 to %n step %c1 { +// CHECK: %[[c5:.+]] = arith.constant 5 : index +// CHECK: scf.for %[[i:.*]] = +// CHECK: %[[S2:.+]] = arith.remui %[[i]], %[[c5]] : index +// CHECK: %[[S3:.+]] = builtin.unrealized_conversion_cast %[[S2]] : index to i64 +// CHECK: %[[S4:.+]] = llvm.extractvalue %0[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> +// CHECK: %[[S5:.+]] = llvm.getelementptr %[[S4]][%[[S3]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 + %mbarId = arith.remui %i, %numBarriers : index + %isDone = nvgpu.mbarrier.test.wait %barriers[%mbarId], %token : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 5>, !tokenType + } + return +} + // CHECK-LABEL: func @mbarrier_txcount func.func @mbarrier_txcount() { - %num_threads = arith.constant 128 : index + %num_threads = arith.constant 128 : index + // CHECK: %[[c0:.+]] = arith.constant 0 : index + // CHECK: %[[mid:.+]] = builtin.unrealized_conversion_cast %[[c0]] : index to i64 + %c0 = arith.constant 0 : index // CHECK: %[[barMemref:.+]] = memref.get_global @__mbarrier{{.*}} : memref<1xi64, 3> %barrier = nvgpu.mbarrier.create -> !barrierType // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> - // CHECK: %[[barPtr:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] - nvgpu.mbarrier.init %barrier, %num_threads : !barrierType + nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType - %c0 = arith.constant 0 : index %tidxreg = nvvm.read.ptx.sreg.tid.x : i32 %tidx = arith.index_cast %tidxreg : i32 to index %cnd = arith.cmpi eq, %tidx, %c0 : index scf.if %cnd { %txcount = arith.constant 256 : index - // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.arrive.expect_tx.shared %[[barPtr2]] - nvgpu.mbarrier.arrive.expect_tx %barrier, %txcount : !barrierType + nvgpu.mbarrier.arrive.expect_tx %barrier[%c0], %txcount : !barrierType scf.yield } else { %txcount = arith.constant 0 : index - // CHECK: %[[barPtr2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : 
!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.arrive.expect_tx.shared %[[barPtr2]] - nvgpu.mbarrier.arrive.expect_tx %barrier, %txcount : !barrierType + nvgpu.mbarrier.arrive.expect_tx %barrier[%c0], %txcount : !barrierType scf.yield } %phase = arith.constant 0 : index %ticks = arith.constant 10000000 : index - // CHECK: %[[barPtr3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 // CHECK: nvvm.mbarrier.try_wait.parity.shared %[[barPtr3]] - nvgpu.mbarrier.try_wait.parity %barrier, %phase, %ticks : !barrierType + nvgpu.mbarrier.try_wait.parity %barrier[%c0], %phase, %ticks : !barrierType func.return } @@ -567,7 +606,7 @@ func.func @mbarrier_txcount() { !tensorMap3d = !nvgpu.tensormap.descriptor<tensor = memref<2x32x32xf32,3>, swizzle=swizzle_64b, l2promo = l2promo_64b, oob = zero, interleave = none> !tensorMap4d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x32x32xf32,3>, swizzle=swizzle_128b,l2promo = l2promo_128b,oob = zero, interleave = interleave_16b> !tensorMap5d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x2x32x32xf32,3>, swizzle=none, l2promo = none, oob = zero, interleave = none> -!mbarrier = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>> +!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>> func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d, %buffer1d: memref<128xf32,3>, %buffer2d: memref<32x32xf32,3>, @@ -575,18 +614,19 @@ func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d %buffer4d: memref<2x2x32x32xf32,3>, %buffer5d: memref<2x2x2x32x32xf32,3>, %mbarrier: !mbarrier) { + %c0 = arith.constant 0 : index %crd0 = arith.constant 0 : index %crd1 = arith.constant 0 : index // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}] - nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier to %buffer1d : !tensorMap1d, !mbarrier -> memref<128xf32,3> + nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier[%c0] to %buffer1d : !tensorMap1d, !mbarrier -> memref<128xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}] - nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier to %buffer2d : !tensorMap2d, !mbarrier -> memref<32x32xf32,3> + nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d : !tensorMap2d, !mbarrier -> memref<32x32xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}] - nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier to %buffer3d : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3> + nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] - nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier to %buffer4d : 
!tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3> + nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier[%c0] to %buffer4d : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] - nvgpu.tma.async.load %tensorMap5d[%crd0, %crd1, %crd1, %crd0, %crd0], %mbarrier to %buffer5d : !tensorMap5d, !mbarrier -> memref<2x2x2x32x32xf32,3> + nvgpu.tma.async.load %tensorMap5d[%crd0, %crd1, %crd1, %crd0, %crd0], %mbarrier[%c0] to %buffer5d : !tensorMap5d, !mbarrier -> memref<2x2x2x32x32xf32,3> func.return } @@ -621,12 +661,12 @@ module @mymodule { %rhsShmem3 = memref.subview %rhsShmem2[1,0,0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16,3> to memref<1x64x128xf16, strided<[8192, 128, 1], offset: 8192>, 3> %rhsShmem = memref.subview %rhsShmem3[0,0,0][1, 64, 128][1, 1, 1] : memref<1x64x128xf16, strided<[8192, 128, 1], offset: 8192>, 3> to !shmemrhs // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global - nvgpu.tma.async.load %lhsTensorMap[%c0, %c0], %mbarrier to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs + nvgpu.tma.async.load %lhsTensorMap[%c0, %c0], %mbarrier[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs // CHECK: %[[desc:.+]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[c8192:.+]] = llvm.mlir.constant(8192 : index) : i64 // CHECK: %[[shmemOfset:.+]] = llvm.getelementptr %[[desc]][%[[c8192]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %[[shmemOfset]], %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i32, i32 - nvgpu.tma.async.load %rhsTensorMap[%c0, %c0], %mbarrier to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs + nvgpu.tma.async.load %rhsTensorMap[%c0, %c0], %mbarrier[%c0] to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs return } } diff --git a/mlir/test/Dialect/NVGPU/tmaload-transform.mlir b/mlir/test/Dialect/NVGPU/tmaload-transform.mlir index 646008b..30f8c45 100644 --- a/mlir/test/Dialect/NVGPU/tmaload-transform.mlir +++ b/mlir/test/Dialect/NVGPU/tmaload-transform.mlir @@ -34,7 +34,7 @@ func.func @main() { %out_1 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, #gpu.address_space<workgroup>> // CHECK: %[[B:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup> - // CHECK: nvgpu.mbarrier.init %[[B]], %{{.*}} : <memorySpace = #gpu.address_space<workgroup> + // CHECK: nvgpu.mbarrier.init %[[B]][%{{.*}}], %{{.*}} : <memorySpace = #gpu.address_space<workgroup> // CHECK: gpu.barrier // // CHECK: %[[c0:.*]] = arith.constant 0 : index @@ -44,27 +44,27 @@ func.func @main() { // CHECK: scf.if %[[CMP]] { // // CHECK: %[[c0_7:.*]] = arith.constant 0 : index - // CHECK: nvgpu.tma.async.load %[[D1]][%[[c0_7]], %[[c0_7]]], %[[B]] to %[[G1]] + // CHECK: nvgpu.tma.async.load %[[D1]][%[[c0_7]], %[[c0_7]]], %[[B]][%{{.*}}] to %[[G1]] // CHECK-SAME: : <tensor = memref<64x8xf32, #gpu.address_space<workgroup>>, // CHECK-SAME: swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup> // CHECK-SAME: -> memref<64x8xf32, #gpu.address_space<workgroup>> // // CHECK: %[[c0_8:.*]] = arith.constant 0 : index - // CHECK: nvgpu.tma.async.load %[[D2]][%[[c0_8]], %[[c0_8]]], %[[B]] to %[[G2]] + // CHECK: nvgpu.tma.async.load %[[D2]][%[[c0_8]], %[[c0_8]]], %[[B]][%{{.*}}] to %[[G2]] // CHECK-SAME: : <tensor = 
memref<8x128xf32, #gpu.address_space<workgroup>>, // CHECK-SAME: swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup> // CHECK-SAME: -> memref<8x128xf32, #gpu.address_space<workgroup>> // // CHECK: %[[c6144:.*]] = arith.constant 6144 : index - // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]], %[[c6144]] : <memorySpace = #gpu.address_space<workgroup> + // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]][%{{.*}}], %[[c6144]] : <memorySpace = #gpu.address_space<workgroup> // CHECK: } else { // CHECK: %[[c0_7:.*]] = arith.constant 0 : index - // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]], %[[c0_7]] : <memorySpace = #gpu.address_space<workgroup> + // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]][%{{.*}}], %[[c0_7]] : <memorySpace = #gpu.address_space<workgroup> // CHECK: } // // CHECK: %[[c0_6:.*]] = arith.constant 0 : index // CHECK: %[[c10000000:.*]] = arith.constant 10000000 : index - // CHECK: nvgpu.mbarrier.try_wait.parity %[[B]], %[[c0_6]], %[[c10000000]] : <memorySpace = #gpu.address_space<workgroup> + // CHECK: nvgpu.mbarrier.try_wait.parity %[[B]][%{{.*}}], %[[c0_6]], %[[c10000000]] : <memorySpace = #gpu.address_space<workgroup> /// Both copies are matched and end up in the same async group. linalg.copy ins(%memref: memref<64x8xf32>) outs(%out: memref<64x8xf32, #gpu.address_space<workgroup>>) |