diff options
author | Ivan Butygin <ivan.butygin@gmail.com> | 2025-07-17 23:42:25 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-07-18 00:42:25 +0300 |
commit | 6b29ee9d9a8dc6eaf1f47b4d66b4c569e00a112f (patch) | |
tree | 5a3eee53e64d9bc7ea206069d28fe0f0117ac25b | |
parent | b8264293a714347a77f150b109cfdde8665eeadc (diff) | |
download | llvm-6b29ee9d9a8dc6eaf1f47b4d66b4c569e00a112f.zip llvm-6b29ee9d9a8dc6eaf1f47b4d66b4c569e00a112f.tar.gz llvm-6b29ee9d9a8dc6eaf1f47b4d66b4c569e00a112f.tar.bz2 |
[mlir][amdgpu] Properly handle mismatching memref ranks in `amdgpu.gather_to_lds` (#149407)
This op doesn't have any rank or indices restrictions on src/dst
memrefs, but was using `SameVariadicOperandSize` which was causing
issues. Also fix some other issues while we're at it.
-rw-r--r-- | mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 16 | ||||
-rw-r--r-- | mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 4 | ||||
-rw-r--r-- | mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir | 7 | ||||
-rw-r--r-- | mlir/test/Dialect/AMDGPU/invalid.mlir | 8 | ||||
-rw-r--r-- | mlir/test/Dialect/AMDGPU/ops.mlir | 11 |
5 files changed, 36 insertions, 10 deletions
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index eadb5d9..80959ffb 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -127,7 +127,7 @@ def AMDGPU_ScaledExtPackedOp let summary = "Extend a vector of packed floating point values"; let description = [{ - Extend and scale two packed floats in `source[index]` to two floats and + Extend and scale two packed floats in `source[index]` to two floats and return them. This rather unusual signature arises from the fact that AMD GPUs cannot @@ -861,7 +861,7 @@ def AMDGPU_WMMAOp : } def AMDGPU_GatherToLDSOp : - AMDGPU_Op<"gather_to_lds", [SameVariadicOperandSize]>, + AMDGPU_Op<"gather_to_lds", [AttrSizedOperandSegments]>, Arguments<(ins Arg<AnyMemRef, "buffer to gather from", [MemRead]>:$src, Variadic<Index>:$srcIndices, @@ -966,13 +966,13 @@ def AMDGPU_ScaledMFMAOp : order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on). This wrapper takes inspiration from `amdgpu.mfma`, but has some key differences: - - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and - fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile - size. - - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp` + - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and + fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile + size. + - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp` are omitted from this wrapper. - - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for - double-precision operations on gfx94x and so are not included here. + - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for + double-precision operations on gfx94x and so are not included here. 
}]; let assemblyFormat = [{ `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index acaf6a2..88c2eb3 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -134,6 +134,8 @@ static bool hasGlobalMemorySpace(Attribute memorySpace) { } static bool hasWorkgroupMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return false; if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace)) return intMemorySpace.getInt() == 3; if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace)) @@ -142,6 +144,8 @@ static bool hasWorkgroupMemorySpace(Attribute memorySpace) { } static bool hasFatRawBufferMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return false; if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace)) return intMemorySpace.getInt() == 7; if (auto gpuMemorySpace = dyn_cast<amdgpu::AddressSpaceAttr>(memorySpace)) diff --git a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir index 77103fa..e48c941 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir @@ -127,12 +127,15 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] // CHECK: %[[ALLOC:.*]] = memref.alloc() // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]] + // CHECK: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[C0_I64:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64 // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]] // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1] // CHECK: 
%[[C64:.*]] = llvm.mlir.constant(64 : index) : i64 // CHECK: %[[DSTIDX:.*]] = llvm.mul %[[DSTIDX_CAST]], %[[C64]] : i64 - // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX]]] + // CHECK: %[[DSTIDX1:.*]] = llvm.add %[[DSTIDX]], %[[C0_I64]] : i64 + // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX1]]] // CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4 %alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace> %c0 = arith.constant 0 : index @@ -151,7 +154,7 @@ func.func @fat_buffer_load_to_rocdl_f32(%global : memref<128x72xf32, #amdgpu_fat // CHECK: %[[BUFFER_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64 + // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64 // CHECK: %[[C12:.*]] = arith.constant 12 : index // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]] // CHECK: %[[C32:.*]] = arith.constant 32 : index diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 6d55583..0d2fd24 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -222,3 +222,11 @@ func.func @transpose_load_vector_size_i8(%idx1 : index, %idx2 : index, %mem : me %0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xi6, 3> -> vector<8xi6> func.return %0 : vector<8xi6> } + +// ----- + +func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : memref<32xf16>) { + // expected-error@+1 {{'amdgpu.gather_to_lds' op destination memory address space must be Workgroup}} + amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16> + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 51f3bbd..5559ac8 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ 
b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -493,3 +493,14 @@ func.func @transpose_load(%idx1 : index, %idx2 : index, %mem : memref<128x32xf16 %0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xf16, 3> -> vector<4xf16> func.return %0 : vector<4xf16> } + +// CHECK-LABEL: func @gather_to_lds +func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %mem2 : memref<32x32xf16>, %smem1 : memref<32xf16, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf16, #gpu.address_space<workgroup>>) { + // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}] + // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}] + // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}] + amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32x32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>> + amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem1[%idx1] : vector<2xf16>, memref<32x32xf16>, memref<32xf16, #gpu.address_space<workgroup>> + amdgpu.gather_to_lds %mem1[%idx1], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>> + func.return +} |