aboutsummaryrefslogtreecommitdiff
path: root/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'flang/lib/Optimizer/Builder/IntrinsicCall.cpp')
-rw-r--r--flang/lib/Optimizer/Builder/IntrinsicCall.cpp99
1 files changed, 92 insertions, 7 deletions
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 2c21868..7c5c5fb 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -346,6 +346,14 @@ static constexpr IntrinsicHandler handlers[]{
&I::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>,
{{{"mask", asValue}, {"pred", asValue}}},
/*isElemental=*/false},
+ {"barrier_arrive",
+ &I::genBarrierArrive,
+ {{{"barrier", asAddr}}},
+ /*isElemental=*/false},
+ {"barrier_arrive_cnt",
+ &I::genBarrierArriveCnt,
+ {{{"barrier", asAddr}, {"count", asValue}}},
+ /*isElemental=*/false},
{"barrier_init",
&I::genBarrierInit,
{{{"barrier", asAddr}, {"count", asValue}}},
@@ -494,6 +502,10 @@ static constexpr IntrinsicHandler handlers[]{
&I::genExtendsTypeOf,
{{{"a", asBox}, {"mold", asBox}}},
/*isElemental=*/false},
+ {"fence_proxy_async",
+ &I::genFenceProxyAsync,
+ {},
+ /*isElemental=*/false},
{"findloc",
&I::genFindloc,
{{{"array", asBox},
@@ -1004,6 +1016,14 @@ static constexpr IntrinsicHandler handlers[]{
{"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false},
{"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false},
{"time", &I::genTime, {}, /*isElemental=*/false},
+ {"tma_bulk_commit_group",
+ &I::genTMABulkCommitGroup,
+ {{}},
+ /*isElemental=*/false},
+ {"tma_bulk_wait_group",
+ &I::genTMABulkWaitGroup,
+ {{}},
+ /*isElemental=*/false},
{"trailz", &I::genTrailz},
{"transfer",
&I::genTransfer,
@@ -3180,20 +3200,59 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType,
return fir::runtime::genAssociated(builder, loc, pointerBox, targetBox);
}
-// BARRIER_INIT (CUDA)
-void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 2);
- auto llvmPtr = fir::ConvertOp::create(
+static mlir::Value convertBarrierToLLVM(fir::FirOpBuilder &builder,
+ mlir::Location loc,
+ mlir::Value barrier) {
+ mlir::Value llvmPtr = fir::ConvertOp::create(
builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()),
- fir::getBase(args[0]));
- auto addrCast = mlir::LLVM::AddrSpaceCastOp::create(
+ barrier);
+ mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create(
builder, loc,
mlir::LLVM::LLVMPointerType::get(
builder.getContext(),
static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Shared)),
llvmPtr);
- mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, addrCast,
+ return addrCast;
+}
+
+// BARRIER_ARRIVE (CUDA)
+mlir::Value
+IntrinsicLibrary::genBarrierArrive(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 1);
+ mlir::Value barrier = convertBarrierToLLVM(builder, loc, args[0]);
+ return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType,
+ barrier)
+ .getResult();
+}
+
+// BARRIER_ARRIBVE_CNT (CUDA)
+mlir::Value
+IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
+ mlir::Value barrier = convertBarrierToLLVM(builder, loc, args[0]);
+ mlir::Value token = fir::AllocaOp::create(builder, loc, resultType);
+ // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and
+ // currently just the sink symbol `_`.
+ // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
+ mlir::NVVM::MBarrierArriveExpectTxOp::create(builder, loc, barrier, args[1],
+ {});
+ return fir::LoadOp::create(builder, loc, token);
+}
+
+// BARRIER_INIT (CUDA)
+void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 2);
+ mlir::Value barrier =
+ convertBarrierToLLVM(builder, loc, fir::getBase(args[0]));
+ mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, barrier,
fir::getBase(args[1]), {});
+ auto kind = mlir::NVVM::ProxyKindAttr::get(
+ builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
+ auto space = mlir::NVVM::SharedSpaceAttr::get(
+ builder.getContext(), mlir::NVVM::SharedSpace::shared_cta);
+ mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
}
// BESSEL_JN
@@ -4312,6 +4371,17 @@ IntrinsicLibrary::genExtendsTypeOf(mlir::Type resultType,
fir::getBase(args[1])));
}
+// FENCE_PROXY_ASYNC (CUDA)
+void IntrinsicLibrary::genFenceProxyAsync(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 0);
+ auto kind = mlir::NVVM::ProxyKindAttr::get(
+ builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
+ auto space = mlir::NVVM::SharedSpaceAttr::get(
+ builder.getContext(), mlir::NVVM::SharedSpace::shared_cta);
+ mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
+}
+
// FINDLOC
fir::ExtendedValue
IntrinsicLibrary::genFindloc(mlir::Type resultType,
@@ -9127,6 +9197,21 @@ mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType,
fir::runtime::genTime(builder, loc));
}
+// TMA_BULK_COMMIT_GROUP (CUDA)
+void IntrinsicLibrary::genTMABulkCommitGroup(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 0);
+ mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc);
+}
+
+// TMA_BULK_WAIT_GROUP (CUDA)
+void IntrinsicLibrary::genTMABulkWaitGroup(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 0);
+ auto group = builder.getIntegerAttr(builder.getI32Type(), 0);
+ mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {});
+}
+
// TRIM
fir::ExtendedValue
IntrinsicLibrary::genTrim(mlir::Type resultType,