author    Johannes Reifferscheid <jreiffers@google.com>    2024-03-21 10:33:49 +0100
committer GitHub <noreply@github.com>    2024-03-21 10:33:49 +0100
commit    a6a9215b93bcbf901cd11d2dd02cce1a245d3ffe (patch)
tree      6c8474b424d56d7013de49975e23690072add76a /mlir
parent    ee5e027cc64957c0e18b8c38ce10d4b84314511c (diff)
Lower shuffle to single-result form if possible. (#84321)
We currently always lower gpu.shuffle to the struct-returning variant of nvvm.shfl.sync. I saw cases where the struct survived all the way through to PTX, resulting in increased register usage. The easiest fix is to lower to the single-result version whenever the predicate result is unused.
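As a concrete sketch of the fix (hedged: the op syntax mirrors the updated tests below, but the SSA names and the %mask/%clamp operands are illustrative):

    // The predicate result of gpu.shuffle is never consumed ...
    %shfl, %pred = gpu.shuffle xor %val, %offset, %width : f32
    // ... so the lowering can now use the single-result intrinsic form,
    // and no !llvm.struct<(f32, i1)> value survives to PTX.
    %value = nvvm.shfl.sync bfly %mask, %val, %offset, %clamp : f32 -> f32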
Diffstat (limited to 'mlir')
-rw-r--r--  mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp | 23
-rw-r--r--  mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir        | 36
2 files changed, 49 insertions(+), 10 deletions(-)
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index d6a5d8cd..b95fba2 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -155,8 +155,6 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
     auto valueTy = adaptor.getValue().getType();
     auto int32Type = IntegerType::get(rewriter.getContext(), 32);
     auto predTy = IntegerType::get(rewriter.getContext(), 1);
-    auto resultTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(),
-                                                     {valueTy, predTy});
 
     Value one = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 1);
     Value minusOne = rewriter.create<LLVM::ConstantOp>(loc, int32Type, -1);
@@ -176,14 +174,25 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
           rewriter.create<LLVM::SubOp>(loc, int32Type, adaptor.getWidth(), one);
     }
 
-    auto returnValueAndIsValidAttr = rewriter.getUnitAttr();
+    bool predIsUsed = !op->getResult(1).use_empty();
+    UnitAttr returnValueAndIsValidAttr = nullptr;
+    Type resultTy = valueTy;
+    if (predIsUsed) {
+      returnValueAndIsValidAttr = rewriter.getUnitAttr();
+      resultTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(),
+                                                  {valueTy, predTy});
+    }
     Value shfl = rewriter.create<NVVM::ShflOp>(
         loc, resultTy, activeMask, adaptor.getValue(), adaptor.getOffset(),
         maskAndClamp, convertShflKind(op.getMode()), returnValueAndIsValidAttr);
-    Value shflValue = rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 0);
-    Value isActiveSrcLane = rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 1);
-
-    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
+    if (predIsUsed) {
+      Value shflValue = rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 0);
+      Value isActiveSrcLane =
+          rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 1);
+      rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
+    } else {
+      rewriter.replaceOp(op, {shfl, nullptr});
+    }
     return success();
   }
 };
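For contrast, a minimal sketch of the unchanged path (names illustrative, matching the CHECK lines in the test below): when the predicate result has uses, the pattern still emits the struct-returning form and unpacks both members with llvm.extractvalue:

    %res = nvvm.shfl.sync bfly %mask, %val, %offset, %clamp
        {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)>
    %value = llvm.extractvalue %res[0] : !llvm.struct<(f32, i1)>
    %valid = llvm.extractvalue %res[1] : !llvm.struct<(f32, i1)>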
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index dd3b6c20..8877ee0 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -112,7 +112,7 @@ gpu.module @test_module_3 {
 
 gpu.module @test_module_4 {
   // CHECK-LABEL: func @gpu_shuffle()
-  func.func @gpu_shuffle() -> (f32, f32, f32, f32) {
+  func.func @gpu_shuffle() -> (f32, f32, f32, f32, i1, i1, i1, i1) {
     // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
     %arg0 = arith.constant 1.0 : f32
     // CHECK: %[[#OFFSET:]] = llvm.mlir.constant(4 : i32) : i32
@@ -143,11 +143,41 @@ gpu.module @test_module_4 {
     // CHECK: nvvm.shfl.sync idx {{.*}} {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)>
     %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
-    func.return %shfl, %shflu, %shfld, %shfli : f32, f32, f32, f32
+    func.return %shfl, %shflu, %shfld, %shfli, %pred, %predu, %predd, %predi
+        : f32, f32, f32, f32, i1, i1, i1, i1
   }
-}
+
+  // CHECK-LABEL: func @gpu_shuffle_unused_pred()
+  func.func @gpu_shuffle_unused_pred() -> (f32, f32, f32, f32) {
+    // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
+    %arg0 = arith.constant 1.0 : f32
+    // CHECK: %[[#OFFSET:]] = llvm.mlir.constant(4 : i32) : i32
+    %arg1 = arith.constant 4 : i32
+    // CHECK: %[[#WIDTH:]] = llvm.mlir.constant(23 : i32) : i32
+    %arg2 = arith.constant 23 : i32
+    // CHECK: %[[#ONE:]] = llvm.mlir.constant(1 : i32) : i32
+    // CHECK: %[[#MINUS_ONE:]] = llvm.mlir.constant(-1 : i32) : i32
+    // CHECK: %[[#THIRTY_TWO:]] = llvm.mlir.constant(32 : i32) : i32
+    // CHECK: %[[#NUM_LANES:]] = llvm.sub %[[#THIRTY_TWO]], %[[#WIDTH]] : i32
+    // CHECK: %[[#MASK:]] = llvm.lshr %[[#MINUS_ONE]], %[[#NUM_LANES]] : i32
+    // CHECK: %[[#CLAMP:]] = llvm.sub %[[#WIDTH]], %[[#ONE]] : i32
+    // CHECK: %[[#SHFL:]] = nvvm.shfl.sync bfly %[[#MASK]], %[[#VALUE]], %[[#OFFSET]], %[[#CLAMP]] : f32 -> f32
+    %shfl, %pred = gpu.shuffle xor %arg0, %arg1, %arg2 : f32
+    // CHECK: %[[#ONE:]] = llvm.mlir.constant(1 : i32) : i32
+    // CHECK: %[[#MINUS_ONE:]] = llvm.mlir.constant(-1 : i32) : i32
+    // CHECK: %[[#THIRTY_TWO:]] = llvm.mlir.constant(32 : i32) : i32
+    // CHECK: %[[#NUM_LANES:]] = llvm.sub %[[#THIRTY_TWO]], %[[#WIDTH]] : i32
+    // CHECK: %[[#MASK:]] = llvm.lshr %[[#MINUS_ONE]], %[[#NUM_LANES]] : i32
+    // CHECK: %[[#SHFL:]] = nvvm.shfl.sync up %[[#MASK]], %[[#VALUE]], %[[#OFFSET]], %[[#NUM_LANES]] : f32 -> f32
+    %shflu, %predu = gpu.shuffle up %arg0, %arg1, %arg2 : f32
+    // CHECK: nvvm.shfl.sync down {{.*}} : f32 -> f32
+    %shfld, %predd = gpu.shuffle down %arg0, %arg1, %arg2 : f32
+    // CHECK: nvvm.shfl.sync idx {{.*}} : f32 -> f32
+    %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
+    func.return %shfl, %shflu, %shfld, %shfli : f32, f32, f32, f32
+  }
+}
 
 gpu.module @test_module_5 {
   // CHECK-LABEL: func @gpu_sync()