diff options
author | lonely eagle <2020382038@qq.com> | 2025-08-06 00:56:55 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-08-06 00:56:55 +0800 |
commit | e3467d8dead7b9c2b9b5012b8c0150fe2e93dee9 (patch) | |
tree | 5dcc4bfb719b3b27579884a22805eb34e6be07b7 | |
parent | f45f4ae7834e2be54cc5d2f51c413d4259ec682e (diff) | |
download | llvm-e3467d8dead7b9c2b9b5012b8c0150fe2e93dee9.zip llvm-e3467d8dead7b9c2b9b5012b8c0150fe2e93dee9.tar.gz llvm-e3467d8dead7b9c2b9b5012b8c0150fe2e93dee9.tar.bz2 |
[mlir][nvgpu] Fix tma descriptor check (#152160)
The TMA descriptor check does not appear to be correct, as it requires
the byte size of the memref's last dimension to be exactly 128 bytes.
However, the byte size of the last dimension should instead match the
swizzle byte width of the descriptor's swizzle kind.
-rw-r--r-- | mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp | 18 | ||||
-rw-r--r-- | mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 42 | ||||
-rw-r--r-- | mlir/test/Dialect/NVGPU/invalid.mlir | 20 |
3 files changed, 47 insertions, 33 deletions
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp index cc03974..34c95e3 100644 --- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp +++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp @@ -345,6 +345,19 @@ LogicalResult LdMatrixOp::verify() { // NVGPU_TmaAsyncLoadOp //===----------------------------------------------------------------------===// +unsigned getSwizzleBytes(TensorMapSwizzleKind kind) { + switch (kind) { + case TensorMapSwizzleKind::SWIZZLE_32B: + return 32; + case TensorMapSwizzleKind::SWIZZLE_64B: + return 64; + case TensorMapSwizzleKind::SWIZZLE_128B: + return 128; + default: + return 0; + } +} + std::optional<InFlightDiagnostic> verifyTmaDescriptorWithMemref( Operation *op, nvgpu::TensorMapDescriptorType descType, std::optional<MemRefType> memrefType = std::nullopt) { @@ -373,10 +386,11 @@ std::optional<InFlightDiagnostic> verifyTmaDescriptorWithMemref( descType.getSwizzle() != TensorMapSwizzleKind::SWIZZLE_NONE) { unsigned lastDimensionByte = descMemref.getElementTypeBitWidth() * descMemref.getShape().back() / 8; - if (lastDimensionByte != kMaxTMALastdimByte) + unsigned expectByte = getSwizzleBytes(descType.getSwizzle()); + if (lastDimensionByte != expectByte) return op->emitError() << "the tensormap descriptor must have last " "dimension of " - << kMaxTMALastdimByte << " bytes but it is " + << expectByte << " bytes but it is " << lastDimensionByte << " bytes"; } diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index d0bc806..8d4f947 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -664,15 +664,15 @@ func.func @mbarrier_txcount_pred() { // CHECK-LABEL: func @async_tma_load !tensorMap1d = !nvgpu.tensormap.descriptor<tensor = memref<128xf32,3>, swizzle=none, l2promo = none, oob = nan, interleave = none> -!tensorMap2d = !nvgpu.tensormap.descriptor<tensor = 
memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none> -!tensorMap3d = !nvgpu.tensormap.descriptor<tensor = memref<2x32x32xf32,3>, swizzle=swizzle_64b, l2promo = l2promo_64b, oob = zero, interleave = none> +!tensorMap2d = !nvgpu.tensormap.descriptor<tensor = memref<32x8xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none> +!tensorMap3d = !nvgpu.tensormap.descriptor<tensor = memref<2x32x16xf32,3>, swizzle=swizzle_64b, l2promo = l2promo_64b, oob = zero, interleave = none> !tensorMap4d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x32x32xf32,3>, swizzle=swizzle_128b,l2promo = l2promo_128b,oob = zero, interleave = none> !tensorMap5d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x2x32x32xf32,3>, swizzle=none, l2promo = none, oob = zero, interleave = none> !mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>> func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d, %buffer1d: memref<128xf32,3>, - %buffer2d: memref<32x32xf32,3>, - %buffer3d: memref<2x32x32xf32,3>, + %buffer2d: memref<32x8xf32,3>, + %buffer3d: memref<2x32x16xf32,3>, %buffer4d: memref<2x2x32x32xf32,3>, %buffer5d: memref<2x2x2x32x32xf32,3>, %mbarrier: !mbarrier) { @@ -682,9 +682,9 @@ func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}] nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier[%c0] to %buffer1d : !tensorMap1d, !mbarrier -> memref<128xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}] - nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d : !tensorMap2d, !mbarrier -> memref<32x32xf32,3> + nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d : !tensorMap2d, !mbarrier -> 
memref<32x8xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}] - nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3> + nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d : !tensorMap3d, !mbarrier -> memref<2x32x16xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier[%c0] to %buffer4d : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] @@ -708,8 +708,8 @@ func.func @async_tma_load_gpu_address_space(%tensorMap1d: !tensorMap1dgpuspace, // CHECK-LABEL: func @async_tma_load_pred func.func @async_tma_load_pred(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d, %buffer1d: memref<128xf32,3>, - %buffer2d: memref<32x32xf32,3>, - %buffer3d: memref<2x32x32xf32,3>, + %buffer2d: memref<32x8xf32,3>, + %buffer3d: memref<2x32x16xf32,3>, %buffer4d: memref<2x2x32x32xf32,3>, %buffer5d: memref<2x2x2x32x32xf32,3>, %mbarrier: !mbarrier, @@ -720,9 +720,9 @@ func.func @async_tma_load_pred(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensor // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}] predicate = %{{.*}} nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier[%c0] to %buffer1d, predicate = %p : !tensorMap1d, !mbarrier -> memref<128xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}] predicate = %{{.*}} - nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d, predicate = %p : !tensorMap2d, !mbarrier -> 
memref<32x32xf32,3> + nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d, predicate = %p : !tensorMap2d, !mbarrier -> memref<32x8xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}] predicate = %{{.*}} - nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d, predicate = %p : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3> + nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d, predicate = %p : !tensorMap3d, !mbarrier -> memref<2x32x16xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] predicate = %{{.*}} nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier[%c0] to %buffer4d, predicate = %p : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] predicate = %{{.*}} @@ -734,7 +734,7 @@ func.func @async_tma_load_multicast( %tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d, %buffer1d: memref<128xf32,3>, - %buffer2d: memref<32x32xf32,3>, %buffer3d: memref<2x32x32xf32,3>, + %buffer2d: memref<32x8xf32,3>, %buffer3d: memref<2x32x16xf32,3>, %buffer4d: memref<2x2x32x32xf32,3>, %buffer5d: memref<2x2x2x32x32xf32,3>, %mbarrier: !mbarrier, %multicastMask: i16) { @@ -744,9 +744,9 @@ func.func @async_tma_load_multicast( // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}] nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier[%c0] to %buffer1d multicast_mask = %multicastMask : !tensorMap1d, !mbarrier -> memref<128xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}] - nvgpu.tma.async.load %tensorMap2d[%crd0, 
%crd1], %mbarrier[%c0] to %buffer2d multicast_mask = %multicastMask : !tensorMap2d, !mbarrier -> memref<32x32xf32,3> + nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d multicast_mask = %multicastMask : !tensorMap2d, !mbarrier -> memref<32x8xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}] - nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d multicast_mask = %multicastMask : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3> + nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d multicast_mask = %multicastMask : !tensorMap3d, !mbarrier -> memref<2x32x16xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier[%c0] to %buffer4d multicast_mask = %multicastMask : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3> // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] @@ -756,8 +756,8 @@ func.func @async_tma_load_multicast( func.func @async_tma_store(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d, %buffer1d: memref<128xf32,3>, - %buffer2d: memref<32x32xf32,3>, - %buffer3d: memref<2x32x32xf32,3>, + %buffer2d: memref<32x8xf32,3>, + %buffer3d: memref<2x32x16xf32,3>, %buffer4d: memref<2x2x32x32xf32,3>, %buffer5d: memref<2x2x2x32x32xf32,3>) { %c0 = arith.constant 0 : index @@ -766,9 +766,9 @@ func.func @async_tma_store(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2 // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}] nvgpu.tma.async.store %buffer1d to %tensorMap1d[%crd0] : memref<128xf32,3> -> !tensorMap1d // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} 
%{{.*}}, box[%{{.*}}, %{{.*}}] - nvgpu.tma.async.store %buffer2d to %tensorMap2d[%crd0, %crd1] : memref<32x32xf32,3> -> !tensorMap2d + nvgpu.tma.async.store %buffer2d to %tensorMap2d[%crd0, %crd1] : memref<32x8xf32,3> -> !tensorMap2d // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}] - nvgpu.tma.async.store %buffer3d to %tensorMap3d[%crd0, %crd1, %crd0] : memref<2x32x32xf32,3> -> !tensorMap3d + nvgpu.tma.async.store %buffer3d to %tensorMap3d[%crd0, %crd1, %crd0] : memref<2x32x16xf32,3> -> !tensorMap3d // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] nvgpu.tma.async.store %buffer4d to %tensorMap4d[%crd0, %crd1, %crd1, %crd0] : memref<2x2x32x32xf32,3> -> !tensorMap4d // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] @@ -779,8 +779,8 @@ func.func @async_tma_store(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2 func.func @async_tma_store_predicate(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d, %buffer1d: memref<128xf32,3>, - %buffer2d: memref<32x32xf32,3>, - %buffer3d: memref<2x32x32xf32,3>, + %buffer2d: memref<32x8xf32,3>, + %buffer3d: memref<2x32x16xf32,3>, %buffer4d: memref<2x2x32x32xf32,3>, %buffer5d: memref<2x2x2x32x32xf32,3>, %p: i1) { @@ -790,9 +790,9 @@ func.func @async_tma_store_predicate(%tensorMap1d: !tensorMap1d, %tensorMap2d: ! 
// CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}], predicate = %{{.*}} nvgpu.tma.async.store %buffer1d to %tensorMap1d[%crd0], predicate = %p : memref<128xf32,3> -> !tensorMap1d // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}], predicate = %{{.*}} - nvgpu.tma.async.store %buffer2d to %tensorMap2d[%crd0, %crd1], predicate = %p : memref<32x32xf32,3> -> !tensorMap2d + nvgpu.tma.async.store %buffer2d to %tensorMap2d[%crd0, %crd1], predicate = %p : memref<32x8xf32,3> -> !tensorMap2d // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}} - nvgpu.tma.async.store %buffer3d to %tensorMap3d[%crd0, %crd1, %crd0], predicate = %p : memref<2x32x32xf32,3> -> !tensorMap3d + nvgpu.tma.async.store %buffer3d to %tensorMap3d[%crd0, %crd1, %crd0], predicate = %p : memref<2x32x16xf32,3> -> !tensorMap3d // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}} nvgpu.tma.async.store %buffer4d to %tensorMap4d[%crd0, %crd1, %crd1, %crd0], predicate = %p : memref<2x2x32x32xf32,3> -> !tensorMap4d // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}} diff --git a/mlir/test/Dialect/NVGPU/invalid.mlir b/mlir/test/Dialect/NVGPU/invalid.mlir index b5bfbe9..2b64fa4 100644 --- a/mlir/test/Dialect/NVGPU/invalid.mlir +++ b/mlir/test/Dialect/NVGPU/invalid.mlir @@ -276,14 +276,14 @@ func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tR // ----- -!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none> +!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x8xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none> !mbarrier = !nvgpu.mbarrier.group<memorySpace = 
#gpu.address_space<workgroup>> -func.func @tma_load_1(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) { +func.func @tma_load_1(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x8xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) { %c0 = arith.constant 0 : index // Pass fine - nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x32xf32,3> + nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x8xf32,3> // expected-error @+1 {{Maximum 5 coordinates are supported.}} - nvgpu.tma.async.load %desc[%c0, %c0, %c0, %c0, %c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x32xf32,3> + nvgpu.tma.async.load %desc[%c0, %c0, %c0, %c0, %c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x8xf32,3> return } // ----- @@ -298,17 +298,17 @@ func.func @tma_load_2(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memr } // ----- -!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none> +!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x8xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none> !mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>> -func.func @tma_load_3(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) { +func.func @tma_load_3(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x8xf32>, %mbarrier: !mbarrier) { %c0 = arith.constant 0 : index // expected-error @+1 {{the destination memref has incorrect address space, it must be shared memory address space}} - nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer3 : !desc, !mbarrier -> memref<32x32xf32> + nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] 
to %buffer3 : !desc, !mbarrier -> memref<32x8xf32> return } // ----- -!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none> +!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x8xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none> !mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>> func.func @tma_load_4(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) { %c0 = arith.constant 0 : index @@ -319,7 +319,7 @@ func.func @tma_load_4(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memr // ----- -!desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf16,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none> +!desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf16,3>, swizzle=swizzle_128b, l2promo = none, oob = zero, interleave = none> func.func @tma_generate_descriptor_incorrect_last_dim(%b0 : index, %b1 : index, %mem : memref<*xf16>) { // expected-error @+1 {{the tensormap descriptor must have last dimension of 128 bytes but it is 256 bytes}} %descA = nvgpu.tma.create.descriptor %mem box[%b0, %b1] : memref<*xf16> -> !desc @@ -328,7 +328,7 @@ func.func @tma_generate_descriptor_incorrect_last_dim(%b0 : index, %b1 : index, // ----- -!desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none> +!desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf32,3>, swizzle=swizzle_128b, l2promo = none, oob = zero, interleave = none> !mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>> func.func @tma_generate_descriptor_incorrect_last_dim(%desc: !desc, %buffer2: memref<64x128xf32,3>, %mbarrier: !mbarrier) { %c0 = arith.constant 0 : index |