diff options
Diffstat (limited to 'flang/lib/Optimizer/Builder/IntrinsicCall.cpp')
-rw-r--r-- | flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 58 |
1 files changed, 48 insertions, 10 deletions
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 7c5c5fb..e07baaf 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -1020,6 +1020,17 @@ static constexpr IntrinsicHandler handlers[]{ &I::genTMABulkCommitGroup, {{}}, /*isElemental=*/false}, + {"tma_bulk_g2s", + &I::genTMABulkG2S, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nbytes", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_s2g", + &I::genTMABulkS2G, + {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, + /*isElemental=*/false}, {"tma_bulk_wait_group", &I::genTMABulkWaitGroup, {{}}, @@ -3200,17 +3211,17 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType, return fir::runtime::genAssociated(builder, loc, pointerBox, targetBox); } -static mlir::Value convertBarrierToLLVM(fir::FirOpBuilder &builder, - mlir::Location loc, - mlir::Value barrier) { +static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value barrier, + mlir::NVVM::NVVMMemorySpace space) { mlir::Value llvmPtr = fir::ConvertOp::create( builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), barrier); mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( builder, loc, - mlir::LLVM::LLVMPointerType::get( - builder.getContext(), - static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Shared)), + mlir::LLVM::LLVMPointerType::get(builder.getContext(), + static_cast<unsigned>(space)), llvmPtr); return addrCast; } @@ -3220,7 +3231,8 @@ mlir::Value IntrinsicLibrary::genBarrierArrive(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { assert(args.size() == 1); - mlir::Value barrier = convertBarrierToLLVM(builder, loc, args[0]); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType, barrier) .getResult(); @@ -3231,7 +3243,8 @@ mlir::Value IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { assert(args.size() == 2); - mlir::Value barrier = convertBarrierToLLVM(builder, loc, args[0]); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); mlir::Value token = fir::AllocaOp::create(builder, loc, resultType); // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and // currently just the sink symbol `_`. @@ -3244,8 +3257,8 @@ IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, // BARRIER_INIT (CUDA) void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 2); - mlir::Value barrier = - convertBarrierToLLVM(builder, loc, fir::getBase(args[0])); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, barrier, fir::getBase(args[1]), {}); auto kind = mlir::NVVM::ProxyKindAttr::get( @@ -9204,6 +9217,31 @@ void IntrinsicLibrary::genTMABulkCommitGroup( mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); } +// TMA_BULK_G2S (CUDA) +void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = + convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), + mlir::NVVM::NVVMMemorySpace::SharedCluster); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( + builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); +} + +// TMA_BULK_S2G (CUDA) +void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), + mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( + builder, loc, dst, src, fir::getBase(args[2]), {}, {}); +} + // TMA_BULK_WAIT_GROUP (CUDA) void IntrinsicLibrary::genTMABulkWaitGroup( llvm::ArrayRef<fir::ExtendedValue> args) { |