diff options
Diffstat (limited to 'flang/lib/Optimizer/Builder/IntrinsicCall.cpp')
-rw-r--r-- | flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 99 |
1 files changed, 92 insertions, 7 deletions
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 2c21868..7c5c5fb 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -346,6 +346,14 @@ static constexpr IntrinsicHandler handlers[]{ &I::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>, {{{"mask", asValue}, {"pred", asValue}}}, /*isElemental=*/false}, + {"barrier_arrive", + &I::genBarrierArrive, + {{{"barrier", asAddr}}}, + /*isElemental=*/false}, + {"barrier_arrive_cnt", + &I::genBarrierArriveCnt, + {{{"barrier", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, {"barrier_init", &I::genBarrierInit, {{{"barrier", asAddr}, {"count", asValue}}}, @@ -494,6 +502,10 @@ static constexpr IntrinsicHandler handlers[]{ &I::genExtendsTypeOf, {{{"a", asBox}, {"mold", asBox}}}, /*isElemental=*/false}, + {"fence_proxy_async", + &I::genFenceProxyAsync, + {}, + /*isElemental=*/false}, {"findloc", &I::genFindloc, {{{"array", asBox}, @@ -1004,6 +1016,14 @@ static constexpr IntrinsicHandler handlers[]{ {"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false}, {"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false}, {"time", &I::genTime, {}, /*isElemental=*/false}, + {"tma_bulk_commit_group", + &I::genTMABulkCommitGroup, + {{}}, + /*isElemental=*/false}, + {"tma_bulk_wait_group", + &I::genTMABulkWaitGroup, + {{}}, + /*isElemental=*/false}, {"trailz", &I::genTrailz}, {"transfer", &I::genTransfer, @@ -3180,20 +3200,59 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType, return fir::runtime::genAssociated(builder, loc, pointerBox, targetBox); } -// BARRIER_INIT (CUDA) -void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - auto llvmPtr = fir::ConvertOp::create( +static mlir::Value convertBarrierToLLVM(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value barrier) { + mlir::Value llvmPtr = 
fir::ConvertOp::create( builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), - fir::getBase(args[0])); - auto addrCast = mlir::LLVM::AddrSpaceCastOp::create( + barrier); + mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( builder, loc, mlir::LLVM::LLVMPointerType::get( builder.getContext(), static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Shared)), llvmPtr); - mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, addrCast, + return addrCast; +} + +// BARRIER_ARRIVE (CUDA) +mlir::Value +IntrinsicLibrary::genBarrierArrive(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 1); + mlir::Value barrier = convertBarrierToLLVM(builder, loc, args[0]); + return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType, + barrier) + .getResult(); +} + +// BARRIER_ARRIVE_CNT (CUDA) +mlir::Value +IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value barrier = convertBarrierToLLVM(builder, loc, args[0]); + mlir::Value token = fir::AllocaOp::create(builder, loc, resultType); + // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and + // currently just uses the sink symbol `_`. 
+ // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive + mlir::NVVM::MBarrierArriveExpectTxOp::create(builder, loc, barrier, args[1], + {}); + return fir::LoadOp::create(builder, loc, token); +} + +// BARRIER_INIT (CUDA) +void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value barrier = + convertBarrierToLLVM(builder, loc, fir::getBase(args[0])); + mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, barrier, fir::getBase(args[1]), {}); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); } // BESSEL_JN @@ -4312,6 +4371,17 @@ IntrinsicLibrary::genExtendsTypeOf(mlir::Type resultType, fir::getBase(args[1]))); } +// FENCE_PROXY_ASYNC (CUDA) +void IntrinsicLibrary::genFenceProxyAsync( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + // FINDLOC fir::ExtendedValue IntrinsicLibrary::genFindloc(mlir::Type resultType, @@ -9127,6 +9197,21 @@ mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType, fir::runtime::genTime(builder, loc)); } +// TMA_BULK_COMMIT_GROUP (CUDA) +void IntrinsicLibrary::genTMABulkCommitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); +} + +// TMA_BULK_WAIT_GROUP (CUDA) +void IntrinsicLibrary::genTMABulkWaitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + 
assert(args.size() == 0); + auto group = builder.getIntegerAttr(builder.getI32Type(), 0); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {}); +} + // TRIM fir::ExtendedValue IntrinsicLibrary::genTrim(mlir::Type resultType, |