aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorlonely eagle <2020382038@qq.com>2025-08-06 00:56:55 +0800
committerGitHub <noreply@github.com>2025-08-06 00:56:55 +0800
commite3467d8dead7b9c2b9b5012b8c0150fe2e93dee9 (patch)
tree5dcc4bfb719b3b27579884a22805eb34e6be07b7
parentf45f4ae7834e2be54cc5d2f51c413d4259ec682e (diff)
downloadllvm-e3467d8dead7b9c2b9b5012b8c0150fe2e93dee9.zip
llvm-e3467d8dead7b9c2b9b5012b8c0150fe2e93dee9.tar.gz
llvm-e3467d8dead7b9c2b9b5012b8c0150fe2e93dee9.tar.bz2
[mlir][nvgpu] Fix tma descriptor check (#152160)
The tma descriptor check does not appear to be correct, as it requires the last dimension of memref to be 128 bytes. However, the bytes of the last dimension can be equal to swizzle bytes.
-rw-r--r--mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp18
-rw-r--r--mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir42
-rw-r--r--mlir/test/Dialect/NVGPU/invalid.mlir20
3 files changed, 47 insertions, 33 deletions
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index cc03974..34c95e3 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -345,6 +345,19 @@ LogicalResult LdMatrixOp::verify() {
// NVGPU_TmaAsyncLoadOp
//===----------------------------------------------------------------------===//
+unsigned getSwizzleBytes(TensorMapSwizzleKind kind) {
+ switch (kind) {
+ case TensorMapSwizzleKind::SWIZZLE_32B:
+ return 32;
+ case TensorMapSwizzleKind::SWIZZLE_64B:
+ return 64;
+ case TensorMapSwizzleKind::SWIZZLE_128B:
+ return 128;
+ default:
+ return 0;
+ }
+}
+
std::optional<InFlightDiagnostic> verifyTmaDescriptorWithMemref(
Operation *op, nvgpu::TensorMapDescriptorType descType,
std::optional<MemRefType> memrefType = std::nullopt) {
@@ -373,10 +386,11 @@ std::optional<InFlightDiagnostic> verifyTmaDescriptorWithMemref(
descType.getSwizzle() != TensorMapSwizzleKind::SWIZZLE_NONE) {
unsigned lastDimensionByte =
descMemref.getElementTypeBitWidth() * descMemref.getShape().back() / 8;
- if (lastDimensionByte != kMaxTMALastdimByte)
+ unsigned expectByte = getSwizzleBytes(descType.getSwizzle());
+ if (lastDimensionByte != expectByte)
return op->emitError() << "the tensormap descriptor must have last "
"dimension of "
- << kMaxTMALastdimByte << " bytes but it is "
+ << expectByte << " bytes but it is "
<< lastDimensionByte << " bytes";
}
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index d0bc806..8d4f947 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -664,15 +664,15 @@ func.func @mbarrier_txcount_pred() {
// CHECK-LABEL: func @async_tma_load
!tensorMap1d = !nvgpu.tensormap.descriptor<tensor = memref<128xf32,3>, swizzle=none, l2promo = none, oob = nan, interleave = none>
-!tensorMap2d = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
-!tensorMap3d = !nvgpu.tensormap.descriptor<tensor = memref<2x32x32xf32,3>, swizzle=swizzle_64b, l2promo = l2promo_64b, oob = zero, interleave = none>
+!tensorMap2d = !nvgpu.tensormap.descriptor<tensor = memref<32x8xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
+!tensorMap3d = !nvgpu.tensormap.descriptor<tensor = memref<2x32x16xf32,3>, swizzle=swizzle_64b, l2promo = l2promo_64b, oob = zero, interleave = none>
!tensorMap4d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x32x32xf32,3>, swizzle=swizzle_128b,l2promo = l2promo_128b,oob = zero, interleave = none>
!tensorMap5d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x2x32x32xf32,3>, swizzle=none, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d,
%buffer1d: memref<128xf32,3>,
- %buffer2d: memref<32x32xf32,3>,
- %buffer3d: memref<2x32x32xf32,3>,
+ %buffer2d: memref<32x8xf32,3>,
+ %buffer3d: memref<2x32x16xf32,3>,
%buffer4d: memref<2x2x32x32xf32,3>,
%buffer5d: memref<2x2x2x32x32xf32,3>,
%mbarrier: !mbarrier) {
@@ -682,9 +682,9 @@ func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}]
nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier[%c0] to %buffer1d : !tensorMap1d, !mbarrier -> memref<128xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}]
- nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d : !tensorMap2d, !mbarrier -> memref<32x32xf32,3>
+ nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d : !tensorMap2d, !mbarrier -> memref<32x8xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}]
- nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3>
+ nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d : !tensorMap3d, !mbarrier -> memref<2x32x16xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier[%c0] to %buffer4d : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
@@ -708,8 +708,8 @@ func.func @async_tma_load_gpu_address_space(%tensorMap1d: !tensorMap1dgpuspace,
// CHECK-LABEL: func @async_tma_load_pred
func.func @async_tma_load_pred(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d,
%buffer1d: memref<128xf32,3>,
- %buffer2d: memref<32x32xf32,3>,
- %buffer3d: memref<2x32x32xf32,3>,
+ %buffer2d: memref<32x8xf32,3>,
+ %buffer3d: memref<2x32x16xf32,3>,
%buffer4d: memref<2x2x32x32xf32,3>,
%buffer5d: memref<2x2x2x32x32xf32,3>,
%mbarrier: !mbarrier,
@@ -720,9 +720,9 @@ func.func @async_tma_load_pred(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensor
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}] predicate = %{{.*}}
nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier[%c0] to %buffer1d, predicate = %p : !tensorMap1d, !mbarrier -> memref<128xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}] predicate = %{{.*}}
- nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d, predicate = %p : !tensorMap2d, !mbarrier -> memref<32x32xf32,3>
+ nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d, predicate = %p : !tensorMap2d, !mbarrier -> memref<32x8xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}] predicate = %{{.*}}
- nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d, predicate = %p : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3>
+ nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d, predicate = %p : !tensorMap3d, !mbarrier -> memref<2x32x16xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] predicate = %{{.*}}
nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier[%c0] to %buffer4d, predicate = %p : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] predicate = %{{.*}}
@@ -734,7 +734,7 @@ func.func @async_tma_load_multicast(
%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d,
%tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d,
%tensorMap5d: !tensorMap5d, %buffer1d: memref<128xf32,3>,
- %buffer2d: memref<32x32xf32,3>, %buffer3d: memref<2x32x32xf32,3>,
+ %buffer2d: memref<32x8xf32,3>, %buffer3d: memref<2x32x16xf32,3>,
%buffer4d: memref<2x2x32x32xf32,3>, %buffer5d: memref<2x2x2x32x32xf32,3>,
%mbarrier: !mbarrier,
%multicastMask: i16) {
@@ -744,9 +744,9 @@ func.func @async_tma_load_multicast(
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}]
nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier[%c0] to %buffer1d multicast_mask = %multicastMask : !tensorMap1d, !mbarrier -> memref<128xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}]
- nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d multicast_mask = %multicastMask : !tensorMap2d, !mbarrier -> memref<32x32xf32,3>
+ nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier[%c0] to %buffer2d multicast_mask = %multicastMask : !tensorMap2d, !mbarrier -> memref<32x8xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}]
- nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d multicast_mask = %multicastMask : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3>
+ nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier[%c0] to %buffer3d multicast_mask = %multicastMask : !tensorMap3d, !mbarrier -> memref<2x32x16xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier[%c0] to %buffer4d multicast_mask = %multicastMask : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3>
// CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
@@ -756,8 +756,8 @@ func.func @async_tma_load_multicast(
func.func @async_tma_store(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d,
%buffer1d: memref<128xf32,3>,
- %buffer2d: memref<32x32xf32,3>,
- %buffer3d: memref<2x32x32xf32,3>,
+ %buffer2d: memref<32x8xf32,3>,
+ %buffer3d: memref<2x32x16xf32,3>,
%buffer4d: memref<2x2x32x32xf32,3>,
%buffer5d: memref<2x2x2x32x32xf32,3>) {
%c0 = arith.constant 0 : index
@@ -766,9 +766,9 @@ func.func @async_tma_store(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2
// CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}]
nvgpu.tma.async.store %buffer1d to %tensorMap1d[%crd0] : memref<128xf32,3> -> !tensorMap1d
// CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}]
- nvgpu.tma.async.store %buffer2d to %tensorMap2d[%crd0, %crd1] : memref<32x32xf32,3> -> !tensorMap2d
+ nvgpu.tma.async.store %buffer2d to %tensorMap2d[%crd0, %crd1] : memref<32x8xf32,3> -> !tensorMap2d
// CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}]
- nvgpu.tma.async.store %buffer3d to %tensorMap3d[%crd0, %crd1, %crd0] : memref<2x32x32xf32,3> -> !tensorMap3d
+ nvgpu.tma.async.store %buffer3d to %tensorMap3d[%crd0, %crd1, %crd0] : memref<2x32x16xf32,3> -> !tensorMap3d
// CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
nvgpu.tma.async.store %buffer4d to %tensorMap4d[%crd0, %crd1, %crd1, %crd0] : memref<2x2x32x32xf32,3> -> !tensorMap4d
// CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
@@ -779,8 +779,8 @@ func.func @async_tma_store(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2
func.func @async_tma_store_predicate(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d,
%buffer1d: memref<128xf32,3>,
- %buffer2d: memref<32x32xf32,3>,
- %buffer3d: memref<2x32x32xf32,3>,
+ %buffer2d: memref<32x8xf32,3>,
+ %buffer3d: memref<2x32x16xf32,3>,
%buffer4d: memref<2x2x32x32xf32,3>,
%buffer5d: memref<2x2x2x32x32xf32,3>,
%p: i1) {
@@ -790,9 +790,9 @@ func.func @async_tma_store_predicate(%tensorMap1d: !tensorMap1d, %tensorMap2d: !
// CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}], predicate = %{{.*}}
nvgpu.tma.async.store %buffer1d to %tensorMap1d[%crd0], predicate = %p : memref<128xf32,3> -> !tensorMap1d
// CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}], predicate = %{{.*}}
- nvgpu.tma.async.store %buffer2d to %tensorMap2d[%crd0, %crd1], predicate = %p : memref<32x32xf32,3> -> !tensorMap2d
+ nvgpu.tma.async.store %buffer2d to %tensorMap2d[%crd0, %crd1], predicate = %p : memref<32x8xf32,3> -> !tensorMap2d
// CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}}
- nvgpu.tma.async.store %buffer3d to %tensorMap3d[%crd0, %crd1, %crd0], predicate = %p : memref<2x32x32xf32,3> -> !tensorMap3d
+ nvgpu.tma.async.store %buffer3d to %tensorMap3d[%crd0, %crd1, %crd0], predicate = %p : memref<2x32x16xf32,3> -> !tensorMap3d
// CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}}
nvgpu.tma.async.store %buffer4d to %tensorMap4d[%crd0, %crd1, %crd1, %crd0], predicate = %p : memref<2x2x32x32xf32,3> -> !tensorMap4d
// CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}}
diff --git a/mlir/test/Dialect/NVGPU/invalid.mlir b/mlir/test/Dialect/NVGPU/invalid.mlir
index b5bfbe9..2b64fa4 100644
--- a/mlir/test/Dialect/NVGPU/invalid.mlir
+++ b/mlir/test/Dialect/NVGPU/invalid.mlir
@@ -276,14 +276,14 @@ func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tR
// -----
-!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
+!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x8xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
-func.func @tma_load_1(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
+func.func @tma_load_1(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x8xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
%c0 = arith.constant 0 : index
// Pass fine
- nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x32xf32,3>
+ nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x8xf32,3>
// expected-error @+1 {{Maximum 5 coordinates are supported.}}
- nvgpu.tma.async.load %desc[%c0, %c0, %c0, %c0, %c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x32xf32,3>
+ nvgpu.tma.async.load %desc[%c0, %c0, %c0, %c0, %c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x8xf32,3>
return
}
// -----
@@ -298,17 +298,17 @@ func.func @tma_load_2(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memr
}
// -----
-!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
+!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x8xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
-func.func @tma_load_3(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
+func.func @tma_load_3(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x8xf32>, %mbarrier: !mbarrier) {
%c0 = arith.constant 0 : index
// expected-error @+1 {{the destination memref has incorrect address space, it must be shared memory address space}}
- nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer3 : !desc, !mbarrier -> memref<32x32xf32>
+ nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer3 : !desc, !mbarrier -> memref<32x8xf32>
return
}
// -----
-!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
+!desc = !nvgpu.tensormap.descriptor<tensor = memref<32x8xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @tma_load_4(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
%c0 = arith.constant 0 : index
@@ -319,7 +319,7 @@ func.func @tma_load_4(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memr
// -----
-!desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf16,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
+!desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf16,3>, swizzle=swizzle_128b, l2promo = none, oob = zero, interleave = none>
func.func @tma_generate_descriptor_incorrect_last_dim(%b0 : index, %b1 : index, %mem : memref<*xf16>) {
// expected-error @+1 {{the tensormap descriptor must have last dimension of 128 bytes but it is 256 bytes}}
%descA = nvgpu.tma.create.descriptor %mem box[%b0, %b1] : memref<*xf16> -> !desc
@@ -328,7 +328,7 @@ func.func @tma_generate_descriptor_incorrect_last_dim(%b0 : index, %b1 : index,
// -----
-!desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
+!desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf32,3>, swizzle=swizzle_128b, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
func.func @tma_generate_descriptor_incorrect_last_dim(%desc: !desc, %buffer2: memref<64x128xf32,3>, %mbarrier: !mbarrier) {
%c0 = arith.constant 0 : index