//===-- CUDAIntrinsicCall.cpp ---------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Helper routines for constructing the FIR dialect of MLIR for CUDA
// intrinsics. Extensive use of MLIR interfaces and MLIR's coding style
// (https://mlir.llvm.org/getting_started/DeveloperGuide/) is used in this
// module.
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h"
#include "flang/Evaluate/common.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/MutableBox.h"
#include "mlir/Dialect/Index/IR/IndexOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"

namespace fir {

using CI = CUDAIntrinsicLibrary;

static const char __ldca_i4x4[] = "__ldca_i4x4_";
static const char __ldca_i8x2[] = "__ldca_i8x2_";
static const char __ldca_r2x2[] = "__ldca_r2x2_";
static const char __ldca_r4x4[] = "__ldca_r4x4_";
static const char __ldca_r8x2[] = "__ldca_r8x2_";
static const char __ldcg_i4x4[] = "__ldcg_i4x4_";
static const char __ldcg_i8x2[] = "__ldcg_i8x2_";
static const char __ldcg_r2x2[] = "__ldcg_r2x2_";
static const char __ldcg_r4x4[] = "__ldcg_r4x4_";
static const char __ldcg_r8x2[] = "__ldcg_r8x2_";
static const char __ldcs_i4x4[] = "__ldcs_i4x4_";
static const char __ldcs_i8x2[] = "__ldcs_i8x2_";
static const char __ldcs_r2x2[] = "__ldcs_r2x2_";
static const char __ldcs_r4x4[] = "__ldcs_r4x4_";
static const char __ldcs_r8x2[] = "__ldcs_r8x2_";
static const char __ldcv_i4x4[] = "__ldcv_i4x4_";
static const char __ldcv_i8x2[] = "__ldcv_i8x2_";
static const char __ldcv_r2x2[] = "__ldcv_r2x2_";
static const char __ldcv_r4x4[] = "__ldcv_r4x4_";
static const char __ldcv_r8x2[] = "__ldcv_r8x2_";
static const char __ldlu_i4x4[] = "__ldlu_i4x4_";
static const char __ldlu_i8x2[] = "__ldlu_i8x2_";
static const char __ldlu_r2x2[] = "__ldlu_r2x2_";
static const char __ldlu_r4x4[] = "__ldlu_r4x4_";
static const char __ldlu_r8x2[] = "__ldlu_r8x2_";

// CUDA specific intrinsic handlers.
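// Each entry maps a CUDA Fortran intrinsic name to its generator: the
// member-function pointer is cast to the matching IntrinsicLibrary generator
// kind (elemental, extended, or subroutine), followed by the argument
// lowering rules (asAddr/asValue) and the isElemental flag. Entries must stay
// sorted by name; the static_assert on isSorted below enforces this.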
static constexpr IntrinsicHandler cudaHandlers[]{
    {"__ldca_i4x4",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldca_i4x4, 4>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldca_i8x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldca_i8x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldca_r2x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldca_r2x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldca_r4x4",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldca_r4x4, 4>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldca_r8x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldca_r8x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcg_i4x4",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcg_i4x4, 4>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcg_i8x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcg_i8x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcg_r2x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcg_r2x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcg_r4x4",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcg_r4x4, 4>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcg_r8x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcg_r8x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcs_i4x4",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcs_i4x4, 4>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcs_i8x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcs_i8x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcs_r2x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcs_r2x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcs_r4x4",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcs_r4x4, 4>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcs_r8x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcs_r8x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcv_i4x4",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcv_i4x4, 4>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcv_i8x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcv_i8x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcv_r2x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcv_r2x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcv_r4x4",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcv_r4x4, 4>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldcv_r8x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldcv_r8x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldlu_i4x4",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldlu_i4x4, 4>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldlu_i8x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldlu_i8x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldlu_r2x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldlu_r2x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldlu_r4x4",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldlu_r4x4, 4>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"__ldlu_r8x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genLDXXFunc<__ldlu_r8x2, 2>),
     {{{"a", asAddr}}},
     /*isElemental=*/false},
    {"all_sync",
     static_cast<IntrinsicLibrary::ElementalGenerator>(
         &CI::genVoteSync<mlir::NVVM::VoteSyncKind::all>),
     {{{"mask", asValue}, {"pred", asValue}}},
     /*isElemental=*/false},
    {"any_sync",
     static_cast<IntrinsicLibrary::ElementalGenerator>(
         &CI::genVoteSync<mlir::NVVM::VoteSyncKind::any>),
     {{{"mask", asValue}, {"pred", asValue}}},
     /*isElemental=*/false},
    {"atomicadd_r4x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genAtomicAddVector<2>),
     {{{"a", asAddr}, {"v", asAddr}}},
     false},
    {"atomicadd_r4x4",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genAtomicAddVector<4>),
     {{{"a", asAddr}, {"v", asAddr}}},
     false},
    {"atomicaddd",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicaddf",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicaddi",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicaddl",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
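    // The atomicaddr2 and atomicaddvector_* handlers below add packed values
    // atomically: a pair of f16 values for r2, and small element vectors for
    // the vector forms (see genAtomicAddR2 and genAtomicAddVector below).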
    {"atomicaddr2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicAddR2),
     {{{"a", asAddr}, {"v", asAddr}}},
     false},
    {"atomicaddvector_r2x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genAtomicAddVector<2>),
     {{{"a", asAddr}, {"v", asAddr}}},
     false},
    {"atomicaddvector_r4x2",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(
         &CI::genAtomicAddVector<2>),
     {{{"a", asAddr}, {"v", asAddr}}},
     false},
    {"atomicandi",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAnd),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomiccasd",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas),
     {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
     false},
    {"atomiccasf",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas),
     {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
     false},
    {"atomiccasi",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas),
     {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
     false},
    {"atomiccasul",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas),
     {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
     false},
    {"atomicdeci",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicDec),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicexchd",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicexchf",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicexchi",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicexchul",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicinci",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicInc),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicmaxd",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicmaxf",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicmaxi",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicmaxl",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicmind",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicminf",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicmini",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicminl",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicori",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicOr),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicsubd",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicsubf",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicsubi",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicsubl",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"atomicxori",
     static_cast<IntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicXor),
     {{{"a", asAddr}, {"v", asValue}}},
     false},
    {"ballot_sync",
     static_cast<IntrinsicLibrary::ElementalGenerator>(
         &CI::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>),
     {{{"mask", asValue}, {"pred", asValue}}},
     /*isElemental=*/false},
    {"barrier_arrive",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genBarrierArrive),
     {{{"barrier", asAddr}}},
     /*isElemental=*/false},
    {"barrier_arrive_cnt",
     static_cast<IntrinsicLibrary::ElementalGenerator>(
         &CI::genBarrierArriveCnt),
     {{{"barrier", asAddr}, {"count", asValue}}},
     /*isElemental=*/false},
    {"barrier_init",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(&CI::genBarrierInit),
     {{{"barrier", asAddr}, {"count", asValue}}},
     /*isElemental=*/false},
    {"barrier_try_wait",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genBarrierTryWait),
     {{{"barrier", asAddr}, {"token", asValue}}},
     /*isElemental=*/false},
    {"barrier_try_wait_sleep",
     static_cast<IntrinsicLibrary::ElementalGenerator>(
         &CI::genBarrierTryWaitSleep),
     {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}},
     /*isElemental=*/false},
    {"clock",
     static_cast<IntrinsicLibrary::ElementalGenerator>(
         &CI::genNVVMTime<mlir::NVVM::ClockOp>),
     {},
     /*isElemental=*/false},
    {"clock64",
     static_cast<IntrinsicLibrary::ElementalGenerator>(
         &CI::genNVVMTime<mlir::NVVM::Clock64Op>),
     {},
     /*isElemental=*/false},
    {"fence_proxy_async",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(
         &CI::genFenceProxyAsync),
     {},
     /*isElemental=*/false},
    {"globaltimer",
     static_cast<IntrinsicLibrary::ElementalGenerator>(
         &CI::genNVVMTime<mlir::NVVM::GlobalTimerOp>),
     {},
     /*isElemental=*/false},
    {"match_all_syncjd",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genMatchAllSync),
     {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
     /*isElemental=*/false},
    {"match_all_syncjf",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genMatchAllSync),
     {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
     /*isElemental=*/false},
    {"match_all_syncjj",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genMatchAllSync),
     {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
     /*isElemental=*/false},
    {"match_all_syncjx",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genMatchAllSync),
     {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
     /*isElemental=*/false},
    {"match_any_syncjd",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genMatchAnySync),
     {{{"mask", asValue}, {"value", asValue}}},
     /*isElemental=*/false},
    {"match_any_syncjf",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genMatchAnySync),
     {{{"mask", asValue}, {"value", asValue}}},
     /*isElemental=*/false},
    {"match_any_syncjj",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genMatchAnySync),
     {{{"mask", asValue}, {"value", asValue}}},
     /*isElemental=*/false},
    {"match_any_syncjx",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genMatchAnySync),
     {{{"mask", asValue}, {"value", asValue}}},
     /*isElemental=*/false},
    {"syncthreads",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(&CI::genSyncThreads),
     {},
     /*isElemental=*/false},
    {"syncthreads_and_i4",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genSyncThreadsAnd),
     {},
     /*isElemental=*/false},
    {"syncthreads_and_l4",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genSyncThreadsAnd),
     {},
     /*isElemental=*/false},
    {"syncthreads_count_i4",
     static_cast<IntrinsicLibrary::ElementalGenerator>(
         &CI::genSyncThreadsCount),
     {},
     /*isElemental=*/false},
    {"syncthreads_count_l4",
     static_cast<IntrinsicLibrary::ElementalGenerator>(
         &CI::genSyncThreadsCount),
     {},
     /*isElemental=*/false},
    {"syncthreads_or_i4",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genSyncThreadsOr),
     {},
     /*isElemental=*/false},
    {"syncthreads_or_l4",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genSyncThreadsOr),
     {},
     /*isElemental=*/false},
    {"syncwarp",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(&CI::genSyncWarp),
     {},
     /*isElemental=*/false},
    {"this_grid",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genThisGrid),
     {},
     /*isElemental=*/false},
    {"this_thread_block",
     static_cast<IntrinsicLibrary::ElementalGenerator>(
         &CI::genThisThreadBlock),
     {},
     /*isElemental=*/false},
    {"this_warp",
     static_cast<IntrinsicLibrary::ElementalGenerator>(&CI::genThisWarp),
     {},
     /*isElemental=*/false},
    {"threadfence",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(&CI::genThreadFence),
     {},
     /*isElemental=*/false},
    {"threadfence_block",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(
         &CI::genThreadFenceBlock),
     {},
     /*isElemental=*/false},
    {"threadfence_system",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(
         &CI::genThreadFenceSystem),
     {},
     /*isElemental=*/false},
    {"tma_bulk_commit_group",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(
         &CI::genTMABulkCommitGroup),
     {{}},
     /*isElemental=*/false},
    {"tma_bulk_g2s",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkG2S),
     {{{"barrier", asAddr},
       {"src", asAddr},
       {"dst", asAddr},
       {"nbytes", asValue}}},
     /*isElemental=*/false},
    {"tma_bulk_ldc4",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkLoadC4),
     {{{"barrier", asAddr},
       {"src", asAddr},
       {"dst", asAddr},
       {"nelems", asValue}}},
     /*isElemental=*/false},
    {"tma_bulk_ldc8",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkLoadC8),
     {{{"barrier", asAddr},
       {"src", asAddr},
       {"dst", asAddr},
       {"nelems", asValue}}},
     /*isElemental=*/false},
    {"tma_bulk_ldi4",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkLoadI4),
     {{{"barrier", asAddr},
       {"src", asAddr},
       {"dst", asAddr},
       {"nelems", asValue}}},
     /*isElemental=*/false},
    {"tma_bulk_ldi8",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkLoadI8),
     {{{"barrier", asAddr},
       {"src", asAddr},
       {"dst", asAddr},
       {"nelems", asValue}}},
     /*isElemental=*/false},
    {"tma_bulk_ldr2",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkLoadR2),
     {{{"barrier", asAddr},
       {"src", asAddr},
       {"dst", asAddr},
       {"nelems", asValue}}},
     /*isElemental=*/false},
    {"tma_bulk_ldr4",
     static_cast<IntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkLoadR4),
     {{{"barrier", asAddr},
       {"src", asAddr},
       {"dst", asAddr},
       {"nelems", asValue}}},
     /*isElemental=*/false},
{"nelems", asValue}}}, /*isElemental=*/false}, {"tma_bulk_ldr8", static_cast( &CI::genTMABulkLoadR8), {{{"barrier", asAddr}, {"src", asAddr}, {"dst", asAddr}, {"nelems", asValue}}}, /*isElemental=*/false}, {"tma_bulk_s2g", static_cast(&CI::genTMABulkS2G), {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, /*isElemental=*/false}, {"tma_bulk_store_c4", static_cast( &CI::genTMABulkStoreC4), {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, /*isElemental=*/false}, {"tma_bulk_store_c8", static_cast( &CI::genTMABulkStoreC8), {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, /*isElemental=*/false}, {"tma_bulk_store_i4", static_cast( &CI::genTMABulkStoreI4), {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, /*isElemental=*/false}, {"tma_bulk_store_i8", static_cast( &CI::genTMABulkStoreI8), {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, /*isElemental=*/false}, {"tma_bulk_store_r2", static_cast( &CI::genTMABulkStoreR2), {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, /*isElemental=*/false}, {"tma_bulk_store_r4", static_cast( &CI::genTMABulkStoreR4), {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, /*isElemental=*/false}, {"tma_bulk_store_r8", static_cast( &CI::genTMABulkStoreR8), {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, /*isElemental=*/false}, {"tma_bulk_wait_group", static_cast( &CI::genTMABulkWaitGroup), {{}}, /*isElemental=*/false}, }; template static constexpr bool isSorted(const IntrinsicHandler (&array)[N]) { // Replace by std::sorted when C++20 is default (will be constexpr). const IntrinsicHandler *lastSeen{nullptr}; bool isSorted{true}; for (const auto &x : array) { if (lastSeen) isSorted &= std::string_view{lastSeen->name} < std::string_view{x.name}; lastSeen = &x; } return isSorted; } static_assert(isSorted(cudaHandlers) && "map must be sorted"); const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name) { auto compare = [](const IntrinsicHandler &cudaHandler, llvm::StringRef name) { return name.compare(cudaHandler.name) > 0; }; auto result = llvm::lower_bound(cudaHandlers, name, compare); return result != std::end(cudaHandlers) && result->name == name ? result : nullptr; } static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value barrier, mlir::NVVM::NVVMMemorySpace space) { mlir::Value llvmPtr = fir::ConvertOp::create( builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), barrier); mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext(), static_cast(space)), llvmPtr); return addrCast; } static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, mlir::Value arg1) { auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); arg0 = builder.createConvert(loc, llvmPointerType, arg0); return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, mlir::LLVM::AtomicOrdering::seq_cst); } // ATOMICADD mlir::Value CUDAIntrinsicLibrary::genAtomicAdd(mlir::Type resultType, llvm::ArrayRef args) { assert(args.size() == 2); mlir::LLVM::AtomicBinOp binOp = mlir::isa(args[1].getType()) ? 
fir::ExtendedValue
CUDAIntrinsicLibrary::genAtomicAddR2(mlir::Type resultType,
                                     llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 2);
  mlir::Value a = fir::getBase(args[0]);
  if (mlir::isa<fir::BaseBoxType>(a.getType())) {
    a = fir::BoxAddrOp::create(builder, loc, a);
  }
  auto f16Ty = builder.getF16Type();
  auto i32Ty = builder.getI32Type();
  auto vecF16Ty = mlir::VectorType::get({2}, f16Ty);
  mlir::Type idxTy = builder.getIndexType();
  auto f16RefTy = fir::ReferenceType::get(f16Ty);
  auto zero = builder.createIntegerConstant(loc, idxTy, 0);
  auto one = builder.createIntegerConstant(loc, idxTy, 1);
  auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy,
                                           fir::getBase(args[1]), zero);
  auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy,
                                           fir::getBase(args[1]), one);
  auto v1 = fir::LoadOp::create(builder, loc, v1Coord);
  auto v2 = fir::LoadOp::create(builder, loc, v2Coord);
  mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty);
  mlir::Value vec1 = mlir::LLVM::InsertElementOp::create(
      builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0));
  mlir::Value vec2 = mlir::LLVM::InsertElementOp::create(
      builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1));
  auto res =
      genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2);
  // Bitcast the vector<2xf16> result to a single i32 holding the packed pair.
  auto i32VecTy = mlir::VectorType::get({1}, i32Ty);
  mlir::Value vecI32 =
      mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res);
  return mlir::vector::ExtractOp::create(builder, loc, vecI32,
                                         mlir::ArrayRef<int64_t>{0});
}

// ATOMICADDVECTOR
template <int extent>
fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector(
    mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 2);
  mlir::Value res = fir::AllocaOp::create(
      builder, loc, fir::SequenceType::get({extent}, resultType));
  mlir::Value a = fir::getBase(args[0]);
  if (mlir::isa<fir::BaseBoxType>(a.getType())) {
    a = fir::BoxAddrOp::create(builder, loc, a);
  }
  auto vecTy = mlir::VectorType::get({extent}, resultType);
  auto refTy = fir::ReferenceType::get(resultType);
  mlir::Type i32Ty = builder.getI32Type();
  mlir::Type idxTy = builder.getIndexType();

  // Extract the values from the array.
  llvm::SmallVector<mlir::Value> values;
  for (unsigned i = 0; i < extent; ++i) {
    mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i);
    mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy,
                                                  fir::getBase(args[1]), pos);
    mlir::Value value = fir::LoadOp::create(builder, loc, coord);
    values.push_back(value);
  }

  // Pack extracted values into a vector to call the atomic add.
  mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy);
  for (unsigned i = 0; i < extent; ++i) {
    mlir::Value insert = mlir::LLVM::InsertElementOp::create(
        builder, loc, undef, values[i],
        builder.createIntegerConstant(loc, i32Ty, i));
    undef = insert;
  }

  // Atomic operation with a vector of values.
  mlir::Value add =
      genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef);

  // Store results in the result array.
  for (unsigned i = 0; i < extent; ++i) {
    mlir::Value r = mlir::LLVM::ExtractElementOp::create(
        builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i));
    mlir::Value c = fir::CoordinateOp::create(
        builder, loc, refTy, res,
        builder.createIntegerConstant(loc, idxTy, i));
    fir::StoreOp::create(builder, loc, r, c);
  }
  mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent);
  return fir::ArrayBoxValue(res, {ext});
}

mlir::Value
CUDAIntrinsicLibrary::genAtomicAnd(mlir::Type resultType,
                                   llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 2);
  assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
  mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and;
  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}

mlir::Value
CUDAIntrinsicLibrary::genAtomicOr(mlir::Type resultType,
                                  llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 2);
  assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
  mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or;
  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}

// ATOMICCAS
fir::ExtendedValue
CUDAIntrinsicLibrary::genAtomicCas(mlir::Type resultType,
                                   llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 3);
  auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel;
  auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic;
  auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext());

  mlir::Value arg0 = fir::getBase(args[0]);
  mlir::Value arg1 = fir::getBase(args[1]);
  mlir::Value arg2 = fir::getBase(args[2]);

  auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value {
    if (mlir::isa<mlir::Float32Type>(arg.getType()))
      return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(),
                                           arg);
    if (mlir::isa<mlir::Float64Type>(arg.getType()))
      return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(),
                                           arg);
    return arg;
  };

  arg1 = bitCastFloat(arg1);
  arg2 = bitCastFloat(arg2);

  if (arg1.getType() != arg2.getType()) {
    // arg1 and arg2 need to have the same type in AtomicCmpXchgOp.
    arg2 = builder.createConvert(loc, arg1.getType(), arg2);
  }

  auto address =
      mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0)
          .getResult(0);
  auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create(
      builder, loc, address, arg1, arg2, successOrdering, failureOrdering);
  mlir::Value boolResult =
      mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1);
  return builder.createConvert(loc, resultType, boolResult);
}

mlir::Value
CUDAIntrinsicLibrary::genAtomicDec(mlir::Type resultType,
                                   llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 2);
  assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
  mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap;
  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}

// ATOMICEXCH
fir::ExtendedValue
CUDAIntrinsicLibrary::genAtomicExch(mlir::Type resultType,
                                    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 2);
  mlir::Value arg0 = fir::getBase(args[0]);
  mlir::Value arg1 = fir::getBase(args[1]);
  assert(arg1.getType().isIntOrFloat());
  mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg;
  return genAtomBinOp(builder, loc, binOp, arg0, arg1);
}

mlir::Value
CUDAIntrinsicLibrary::genAtomicInc(mlir::Type resultType,
                                   llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 2);
  assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
  mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap;
  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}

mlir::Value
CUDAIntrinsicLibrary::genAtomicMax(mlir::Type resultType,
                                   llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 2);
  mlir::LLVM::AtomicBinOp binOp =
      mlir::isa<mlir::IntegerType>(args[1].getType())
          ? mlir::LLVM::AtomicBinOp::max
          : mlir::LLVM::AtomicBinOp::fmax;
  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}
mlir::Value
CUDAIntrinsicLibrary::genAtomicMin(mlir::Type resultType,
                                   llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 2);
  mlir::LLVM::AtomicBinOp binOp =
      mlir::isa<mlir::IntegerType>(args[1].getType())
          ? mlir::LLVM::AtomicBinOp::min
          : mlir::LLVM::AtomicBinOp::fmin;
  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}

// ATOMICSUB
mlir::Value
CUDAIntrinsicLibrary::genAtomicSub(mlir::Type resultType,
                                   llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 2);
  mlir::LLVM::AtomicBinOp binOp =
      mlir::isa<mlir::IntegerType>(args[1].getType())
          ? mlir::LLVM::AtomicBinOp::sub
          : mlir::LLVM::AtomicBinOp::fsub;
  return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}

// ATOMICXOR
fir::ExtendedValue
CUDAIntrinsicLibrary::genAtomicXor(mlir::Type resultType,
                                   llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 2);
  mlir::Value arg0 = fir::getBase(args[0]);
  mlir::Value arg1 = fir::getBase(args[1]);
  return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1);
}

// BARRIER_ARRIVE
mlir::Value
CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType,
                                       llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 1);
  mlir::Value barrier = convertPtrToNVVMSpace(
      builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared);
  return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType,
                                              barrier)
      .getResult();
}

// BARRIER_ARRIVE_CNT
mlir::Value
CUDAIntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType,
                                          llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 2);
  mlir::Value barrier = convertPtrToNVVMSpace(
      builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared);
  return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType},
                                         {barrier, args[1]}, {},
                                         "mbarrier.arrive.expect_tx.release."
                                         "cta.shared::cta.b64 %0, [%1], %2;",
                                         {})
      .getResult(0);
}

// BARRIER_INIT
void CUDAIntrinsicLibrary::genBarrierInit(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 2);
  mlir::Value barrier =
      convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]),
                            mlir::NVVM::NVVMMemorySpace::Shared);
  mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier,
                                     fir::getBase(args[1]), {});
  auto kind = mlir::NVVM::ProxyKindAttr::get(
      builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
  auto space = mlir::NVVM::SharedSpaceAttr::get(
      builder.getContext(), mlir::NVVM::SharedSpace::shared_cta);
  mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
}

// BARRIER_TRY_WAIT
mlir::Value
CUDAIntrinsicLibrary::genBarrierTryWait(mlir::Type resultType,
                                        llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 2);
  mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
  mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
  fir::StoreOp::create(builder, loc, zero, res);
  mlir::Value ns =
      builder.createIntegerConstant(loc, builder.getI32Type(), 1000000);
  mlir::Value load = fir::LoadOp::create(builder, loc, res);
  auto whileOp = mlir::scf::WhileOp::create(
      builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load});
  mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore());
  mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc);
  builder.setInsertionPointToStart(beforeBlock);
  // Keep iterating while the result is still 0, i.e. the wait has not yet
  // succeeded.
  mlir::Value condition = mlir::arith::CmpIOp::create(
      builder, loc, mlir::arith::CmpIPredicate::eq, beforeArg, zero);
  mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg);
  mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter());
  afterBlock->addArgument(resultType, loc);
  builder.setInsertionPointToStart(afterBlock);
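  // Loop body: issue a single mbarrier.try_wait bounded by the ~1 ms window
  // in `ns` above, then feed the PTX predicate back to the scf.while
  // condition so the wait is retried until it succeeds.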
  auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
  auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]);
  mlir::Value ret =
      mlir::NVVM::InlinePtxOp::create(
          builder, loc, {resultType}, {barrier, args[1], ns}, {},
          "{\n"
          "  .reg .pred p;\n"
          "  mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n"
          "  selp.b32 %0, 1, 0, p;\n"
          "}",
          {})
          .getResult(0);
  mlir::scf::YieldOp::create(builder, loc, ret);
  builder.setInsertionPointAfter(whileOp);
  return whileOp.getResult(0);
}

// BARRIER_TRY_WAIT_SLEEP
mlir::Value CUDAIntrinsicLibrary::genBarrierTryWaitSleep(
    mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 3);
  auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
  auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]);
  return mlir::NVVM::InlinePtxOp::create(
             builder, loc, {resultType}, {barrier, args[1], args[2]}, {},
             "{\n"
             "  .reg .pred p;\n"
             "  mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n"
             "  selp.b32 %0, 1, 0, p;\n"
             "}",
             {})
      .getResult(0);
}

// FENCE_PROXY_ASYNC
void CUDAIntrinsicLibrary::genFenceProxyAsync(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 0);
  auto kind = mlir::NVVM::ProxyKindAttr::get(
      builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
  auto space = mlir::NVVM::SharedSpaceAttr::get(
      builder.getContext(), mlir::NVVM::SharedSpace::shared_cta);
  mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
}

// __LDCA, __LDCS, __LDLU, __LDCV
template <const char *fctName, int extent>
fir::ExtendedValue
CUDAIntrinsicLibrary::genLDXXFunc(mlir::Type resultType,
                                  llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 1);
  mlir::Type resTy = fir::SequenceType::get({extent}, resultType);
  mlir::Value arg = fir::getBase(args[0]);
  mlir::Value res = fir::AllocaOp::create(builder, loc, resTy);
  if (mlir::isa<fir::BaseBoxType>(arg.getType()))
    arg = fir::BoxAddrOp::create(builder, loc, arg);
  mlir::Type refResTy = fir::ReferenceType::get(resTy);
  mlir::FunctionType ftype =
      mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {});
  auto funcOp = builder.createFunction(loc, fctName, ftype);
  llvm::SmallVector<mlir::Value> funcArgs;
  funcArgs.push_back(res);
  funcArgs.push_back(arg);
  fir::CallOp::create(builder, loc, funcOp, funcArgs);
  mlir::Value ext =
      builder.createIntegerConstant(loc, builder.getIndexType(), extent);
  return fir::ArrayBoxValue(res, {ext});
}

// CLOCK, CLOCK64, GLOBALTIMER
template <typename OpTy>
mlir::Value
CUDAIntrinsicLibrary::genNVVMTime(mlir::Type resultType,
                                  llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 0 && "expect no arguments");
  return OpTy::create(builder, loc, resultType).getResult();
}

// MATCH_ALL_SYNC
mlir::Value
CUDAIntrinsicLibrary::genMatchAllSync(mlir::Type resultType,
                                      llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 3);
  bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32();

  mlir::Type i1Ty = builder.getI1Type();
  mlir::MLIRContext *context = builder.getContext();
  mlir::Value arg1 = args[1];
  if (arg1.getType().isF32() || arg1.getType().isF64())
    arg1 = fir::ConvertOp::create(
        builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(),
        arg1);
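  // In `all` mode, nvvm.match.sync yields both the matched mask and an i1
  // predicate; model the result as a literal LLVM struct so both members can
  // be extracted below.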
  mlir::Type retTy =
      mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty});
  auto match =
      mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1,
                                      mlir::NVVM::MatchSyncKind::all)
          .getResult();
  auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0);
  auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1);
  auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred);
  fir::StoreOp::create(builder, loc, conv, args[2]);
  return value;
}

// MATCH_ANY_SYNC
mlir::Value
CUDAIntrinsicLibrary::genMatchAnySync(mlir::Type resultType,
                                      llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 2);
  bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32();

  mlir::Value arg1 = args[1];
  if (arg1.getType().isF32() || arg1.getType().isF64())
    arg1 = fir::ConvertOp::create(
        builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(),
        arg1);
  return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0],
                                         arg1, mlir::NVVM::MatchSyncKind::any)
      .getResult();
}

// SYNCTHREADS
void CUDAIntrinsicLibrary::genSyncThreads(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  mlir::NVVM::Barrier0Op::create(builder, loc);
}

// SYNCTHREADS_AND
mlir::Value
CUDAIntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType,
                                        llvm::ArrayRef<mlir::Value> args) {
  constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and";
  mlir::MLIRContext *context = builder.getContext();
  mlir::Type i32 = builder.getI32Type();
  mlir::FunctionType ftype =
      mlir::FunctionType::get(context, {resultType}, {i32});
  auto funcOp = builder.createFunction(loc, funcName, ftype);
  mlir::Value arg = builder.createConvert(loc, i32, args[0]);
  return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0);
}

// SYNCTHREADS_COUNT
mlir::Value
CUDAIntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType,
                                          llvm::ArrayRef<mlir::Value> args) {
  constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc";
  mlir::MLIRContext *context = builder.getContext();
  mlir::Type i32 = builder.getI32Type();
  mlir::FunctionType ftype =
      mlir::FunctionType::get(context, {resultType}, {i32});
  auto funcOp = builder.createFunction(loc, funcName, ftype);
  mlir::Value arg = builder.createConvert(loc, i32, args[0]);
  return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0);
}

// SYNCTHREADS_OR
mlir::Value
CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType,
                                       llvm::ArrayRef<mlir::Value> args) {
  constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or";
  mlir::MLIRContext *context = builder.getContext();
  mlir::Type i32 = builder.getI32Type();
  mlir::FunctionType ftype =
      mlir::FunctionType::get(context, {resultType}, {i32});
  auto funcOp = builder.createFunction(loc, funcName, ftype);
  mlir::Value arg = builder.createConvert(loc, i32, args[0]);
  return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0);
}

// SYNCWARP
void CUDAIntrinsicLibrary::genSyncWarp(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 1);
  mlir::NVVM::SyncWarpOp::create(builder, loc, fir::getBase(args[0]));
}

// THIS_GRID
mlir::Value
CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType,
                                  llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 0);
  auto recTy = mlir::cast<fir::RecordType>(resultType);
  assert(recTy && "RecordType expected");
  mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);

  mlir::Type i32Ty = builder.getI32Type();
  mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty);
  mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty);
  mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty);
  mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty);
  mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty);
  mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty);
  mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty);
  mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty);
  mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty);
  mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty);
  mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty);
  mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty);

  // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) *
  //                  (blockDim.x * gridDim.x);
  mlir::Value resZ =
      mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ);
  mlir::Value resY =
      mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY);
  mlir::Value resX =
      mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX);
  mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY);
  mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX);

  // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) +
  //       blockIdx.x;
  // this_grid.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) +
  //                  ((threadIdx.z * blockDim.y) * blockDim.x) +
  //                  (threadIdx.y * blockDim.x) + threadIdx.x + 1;
  mlir::Value r1 =
      mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY);
  mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX);
  mlir::Value r3 =
      mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX);
  mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3);
  mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX);

  mlir::Value bXbY =
      mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY);
  mlir::Value bXbYbZ =
      mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ);
  mlir::Value tZbY =
      mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY);
  mlir::Value tZbYbX =
      mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX);
  mlir::Value tYbX =
      mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX);
  mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ);
  rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX);
  rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX);
  rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX);
  mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
  rank = mlir::arith::AddIOp::create(builder, loc, rank, one);

  auto sizeFieldName = recTy.getTypeList()[1].first;
  mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
  mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
  mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
      builder, loc, fieldIndexType, sizeFieldName, recTy,
      /*typeParams=*/mlir::ValueRange{});
  mlir::Value sizeCoord = fir::CoordinateOp::create(
      builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
  fir::StoreOp::create(builder, loc, size, sizeCoord);

  auto rankFieldName = recTy.getTypeList()[2].first;
  mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
  mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
      builder, loc, fieldIndexType, rankFieldName, recTy,
      /*typeParams=*/mlir::ValueRange{});
  mlir::Value rankCoord = fir::CoordinateOp::create(
      builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
  fir::StoreOp::create(builder, loc, rank, rankCoord);
  return res;
}
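// Worked example for the rank formula above: with gridDim = (2,1,1) and
// blockDim = (64,1,1), this_grid%size = 64 * 2 = 128, and thread (0,0,0) of
// block (1,0,0) gets tmp = 1 and rank = 1 * 64 + 0 + 0 + 0 + 1 = 65 (ranks
// are 1-based).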
// THIS_THREAD_BLOCK
mlir::Value
CUDAIntrinsicLibrary::genThisThreadBlock(mlir::Type resultType,
                                         llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 0);
  auto recTy = mlir::cast<fir::RecordType>(resultType);
  assert(recTy && "RecordType expected");
  mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
  mlir::Type i32Ty = builder.getI32Type();

  // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x;
  mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty);
  mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty);
  mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty);
  mlir::Value size =
      mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY);
  size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX);

  // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) +
  //                          (threadIdx.y * blockDim.x) + threadIdx.x + 1;
  mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty);
  mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty);
  mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty);
  mlir::Value r1 =
      mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY);
  mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX);
  mlir::Value r3 =
      mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX);
  mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3);
  mlir::Value rank =
      mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX);
  mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
  rank = mlir::arith::AddIOp::create(builder, loc, rank, one);

  auto sizeFieldName = recTy.getTypeList()[1].first;
  mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
  mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
  mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
      builder, loc, fieldIndexType, sizeFieldName, recTy,
      /*typeParams=*/mlir::ValueRange{});
  mlir::Value sizeCoord = fir::CoordinateOp::create(
      builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
  fir::StoreOp::create(builder, loc, size, sizeCoord);

  auto rankFieldName = recTy.getTypeList()[2].first;
  mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
  mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
      builder, loc, fieldIndexType, rankFieldName, recTy,
      /*typeParams=*/mlir::ValueRange{});
  mlir::Value rankCoord = fir::CoordinateOp::create(
      builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
  fir::StoreOp::create(builder, loc, rank, rankCoord);
  return res;
}

// THIS_WARP
mlir::Value
CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType,
                                  llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 0);
  auto recTy = mlir::cast<fir::RecordType>(resultType);
  assert(recTy && "RecordType expected");
  mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
  mlir::Type i32Ty = builder.getI32Type();

  // coalesced_group%size = 32
  mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32);
  auto sizeFieldName = recTy.getTypeList()[1].first;
  mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
  mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
  mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
      builder, loc, fieldIndexType, sizeFieldName, recTy,
      /*typeParams=*/mlir::ValueRange{});
  mlir::Value sizeCoord = fir::CoordinateOp::create(
      builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
  fir::StoreOp::create(builder, loc, size, sizeCoord);
  // coalesced_group%rank = (threadIdx.x & 31) + 1
  mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty);
  mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31);
  mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
  mlir::Value masked =
      mlir::arith::AndIOp::create(builder, loc, threadIdX, mask);
  mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one);
  auto rankFieldName = recTy.getTypeList()[2].first;
  mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
  mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
      builder, loc, fieldIndexType, rankFieldName, recTy,
      /*typeParams=*/mlir::ValueRange{});
  mlir::Value rankCoord = fir::CoordinateOp::create(
      builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
  fir::StoreOp::create(builder, loc, rank, rankCoord);
  return res;
}

// THREADFENCE
void CUDAIntrinsicLibrary::genThreadFence(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl";
  mlir::FunctionType funcType =
      mlir::FunctionType::get(builder.getContext(), {}, {});
  auto funcOp = builder.createFunction(loc, funcName, funcType);
  llvm::SmallVector<mlir::Value> noArgs;
  fir::CallOp::create(builder, loc, funcOp, noArgs);
}

// THREADFENCE_BLOCK
void CUDAIntrinsicLibrary::genThreadFenceBlock(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta";
  mlir::FunctionType funcType =
      mlir::FunctionType::get(builder.getContext(), {}, {});
  auto funcOp = builder.createFunction(loc, funcName, funcType);
  llvm::SmallVector<mlir::Value> noArgs;
  fir::CallOp::create(builder, loc, funcOp, noArgs);
}

// THREADFENCE_SYSTEM
void CUDAIntrinsicLibrary::genThreadFenceSystem(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys";
  mlir::FunctionType funcType =
      mlir::FunctionType::get(builder.getContext(), {}, {});
  auto funcOp = builder.createFunction(loc, funcName, funcType);
  llvm::SmallVector<mlir::Value> noArgs;
  fir::CallOp::create(builder, loc, funcOp, noArgs);
}

// TMA_BULK_COMMIT_GROUP
void CUDAIntrinsicLibrary::genTMABulkCommitGroup(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 0);
  mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc);
}

// TMA_BULK_G2S
void CUDAIntrinsicLibrary::genTMABulkG2S(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 4);
  mlir::Value barrier =
      convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]),
                            mlir::NVVM::NVVMMemorySpace::Shared);
  mlir::Value dst =
      convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]),
                            mlir::NVVM::NVVMMemorySpace::SharedCluster);
  mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]),
                                          mlir::NVVM::NVVMMemorySpace::Global);
  mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create(
      builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {});
}

static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc,
                           mlir::Value barrier, mlir::Value src,
                           mlir::Value dst, mlir::Value nelem,
                           mlir::Value eleSize) {
  mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize);
  auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
  barrier = builder.createConvert(loc, llvmPtrTy, barrier);
  dst = builder.createConvert(loc, llvmPtrTy, dst);
  src = builder.createConvert(loc, llvmPtrTy, src);
  mlir::NVVM::InlinePtxOp::create(
      builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {},
      "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes "
      "[%0], [%1], %2, [%3];",
      {});
  mlir::NVVM::InlinePtxOp::create(
      builder, loc, mlir::TypeRange{}, {barrier, size}, {},
      "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {});
}
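// genTMABulkLoad pairs two PTX instructions: the bulk async copy from global
// to shared memory that completes through the mbarrier, then
// mbarrier.expect_tx to register the expected transaction byte count with
// that barrier. The typed genTMABulkLoad* wrappers below differ only in the
// element size they pass in.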
// TMA_BULK_LOADC4
void CUDAIntrinsicLibrary::genTMABulkLoadC4(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 4);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_LOADC8
void CUDAIntrinsicLibrary::genTMABulkLoadC8(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 4);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 16);
  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_LOADI4
void CUDAIntrinsicLibrary::genTMABulkLoadI4(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 4);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 4);
  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_LOADI8
void CUDAIntrinsicLibrary::genTMABulkLoadI8(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 4);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_LOADR2
void CUDAIntrinsicLibrary::genTMABulkLoadR2(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 4);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 2);
  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_LOADR4
void CUDAIntrinsicLibrary::genTMABulkLoadR4(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 4);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 4);
  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_LOADR8
void CUDAIntrinsicLibrary::genTMABulkLoadR8(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 4);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
  genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                 fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}

// TMA_BULK_S2G
void CUDAIntrinsicLibrary::genTMABulkS2G(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 3);
  mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]),
                                          mlir::NVVM::NVVMMemorySpace::Shared);
  mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]),
                                          mlir::NVVM::NVVMMemorySpace::Global);
  mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(
      builder, loc, dst, src, fir::getBase(args[2]), {}, {});
  mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {},
                                  "cp.async.bulk.commit_group;", {});
  mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc,
                                             builder.getI32IntegerAttr(0), {});
}

static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc,
                            mlir::Value src, mlir::Value dst,
                            mlir::Value count, mlir::Value eleSize) {
  mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count);
  src = convertPtrToNVVMSpace(builder, loc, src,
                              mlir::NVVM::NVVMMemorySpace::Shared);
  dst = convertPtrToNVVMSpace(builder, loc, dst,
                              mlir::NVVM::NVVMMemorySpace::Global);
  mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src,
                                                     size, {}, {});
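  // Make the store visible: commit the bulk-async group and wait until
  // group 0 has drained.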
  mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {},
                                  "cp.async.bulk.commit_group;", {});
  mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc,
                                             builder.getI32IntegerAttr(0), {});
}

// TMA_BULK_STORE_C4
void CUDAIntrinsicLibrary::genTMABulkStoreC4(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 3);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                  fir::getBase(args[2]), eleSize);
}

// TMA_BULK_STORE_C8
void CUDAIntrinsicLibrary::genTMABulkStoreC8(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 3);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 16);
  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                  fir::getBase(args[2]), eleSize);
}

// TMA_BULK_STORE_I4
void CUDAIntrinsicLibrary::genTMABulkStoreI4(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 3);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 4);
  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                  fir::getBase(args[2]), eleSize);
}

// TMA_BULK_STORE_I8
void CUDAIntrinsicLibrary::genTMABulkStoreI8(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 3);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                  fir::getBase(args[2]), eleSize);
}

// TMA_BULK_STORE_R2
void CUDAIntrinsicLibrary::genTMABulkStoreR2(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 3);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 2);
  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                  fir::getBase(args[2]), eleSize);
}

// TMA_BULK_STORE_R4
void CUDAIntrinsicLibrary::genTMABulkStoreR4(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 3);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 4);
  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                  fir::getBase(args[2]), eleSize);
}

// TMA_BULK_STORE_R8
void CUDAIntrinsicLibrary::genTMABulkStoreR8(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 3);
  mlir::Value eleSize =
      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
                  fir::getBase(args[2]), eleSize);
}

// TMA_BULK_WAIT_GROUP
void CUDAIntrinsicLibrary::genTMABulkWaitGroup(
    llvm::ArrayRef<fir::ExtendedValue> args) {
  assert(args.size() == 0);
  auto group = builder.getIntegerAttr(builder.getI32Type(), 0);
  mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {});
}

// ALL_SYNC, ANY_SYNC, BALLOT_SYNC
template <mlir::NVVM::VoteSyncKind kind>
mlir::Value
CUDAIntrinsicLibrary::genVoteSync(mlir::Type resultType,
                                  llvm::ArrayRef<mlir::Value> args) {
  assert(args.size() == 2);
  mlir::Value arg1 =
      fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]);
  mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot
                         ? builder.getI32Type()
                         : builder.getI1Type();
  auto voteRes =
      mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind)
          .getResult();
  return fir::ConvertOp::create(builder, loc, resultType, voteRes);
}

} // namespace fir