Diffstat (limited to 'flang/lib/Optimizer/Builder')
-rw-r--r--  flang/lib/Optimizer/Builder/CMakeLists.txt          |    1
-rw-r--r--  flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp   | 1722
-rw-r--r--  flang/lib/Optimizer/Builder/CUFCommon.cpp           |   64
-rw-r--r--  flang/lib/Optimizer/Builder/FIRBuilder.cpp          |   37
-rw-r--r--  flang/lib/Optimizer/Builder/HLFIRTools.cpp          |   70
-rw-r--r--  flang/lib/Optimizer/Builder/IntrinsicCall.cpp       | 1368
-rw-r--r--  flang/lib/Optimizer/Builder/Runtime/Character.cpp   |   23
-rw-r--r--  flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp  |   40
-rw-r--r--  flang/lib/Optimizer/Builder/Runtime/Reduction.cpp   |    2
-rw-r--r--  flang/lib/Optimizer/Builder/TemporaryStorage.cpp    |    8
10 files changed, 2067 insertions, 1268 deletions
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index 1f95259..37c9c2d 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -5,6 +5,7 @@ add_flang_library(FIRBuilder
BoxValue.cpp
Character.cpp
Complex.cpp
+ CUDAIntrinsicCall.cpp
CUFCommon.cpp
DoLoopHelper.cpp
FIRBuilder.cpp
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
new file mode 100644
index 0000000..3c86a9d
--- /dev/null
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -0,0 +1,1722 @@
+//===-- CUDAIntrinsicCall.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Helper routines for constructing the FIR dialect of MLIR for CUDA
+// intrinsics. This module makes extensive use of MLIR interfaces and follows
+// MLIR's coding style (https://mlir.llvm.org/getting_started/DeveloperGuide/).
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h"
+#include "flang/Evaluate/common.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/MutableBox.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+
+namespace fir {
+
+using CI = CUDAIntrinsicLibrary;
+
+static const char __ldca_i4x4[] = "__ldca_i4x4_";
+static const char __ldca_i8x2[] = "__ldca_i8x2_";
+static const char __ldca_r2x2[] = "__ldca_r2x2_";
+static const char __ldca_r4x4[] = "__ldca_r4x4_";
+static const char __ldca_r8x2[] = "__ldca_r8x2_";
+static const char __ldcg_i4x4[] = "__ldcg_i4x4_";
+static const char __ldcg_i8x2[] = "__ldcg_i8x2_";
+static const char __ldcg_r2x2[] = "__ldcg_r2x2_";
+static const char __ldcg_r4x4[] = "__ldcg_r4x4_";
+static const char __ldcg_r8x2[] = "__ldcg_r8x2_";
+static const char __ldcs_i4x4[] = "__ldcs_i4x4_";
+static const char __ldcs_i8x2[] = "__ldcs_i8x2_";
+static const char __ldcs_r2x2[] = "__ldcs_r2x2_";
+static const char __ldcs_r4x4[] = "__ldcs_r4x4_";
+static const char __ldcs_r8x2[] = "__ldcs_r8x2_";
+static const char __ldcv_i4x4[] = "__ldcv_i4x4_";
+static const char __ldcv_i8x2[] = "__ldcv_i8x2_";
+static const char __ldcv_r2x2[] = "__ldcv_r2x2_";
+static const char __ldcv_r4x4[] = "__ldcv_r4x4_";
+static const char __ldcv_r8x2[] = "__ldcv_r8x2_";
+static const char __ldlu_i4x4[] = "__ldlu_i4x4_";
+static const char __ldlu_i8x2[] = "__ldlu_i8x2_";
+static const char __ldlu_r2x2[] = "__ldlu_r2x2_";
+static const char __ldlu_r4x4[] = "__ldlu_r4x4_";
+static const char __ldlu_r8x2[] = "__ldlu_r8x2_";
+
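+// PTX cp.async.bulk requires 16-byte alignment on its shared-memory
+// addresses; shared destinations are forced to this alignment below.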
+static constexpr unsigned kTMAAlignment = 16;
+
+// CUDA-specific intrinsic handlers. The table must be kept sorted by name;
+// the ordering is enforced by the static_assert after the table.
+static constexpr IntrinsicHandler cudaHandlers[]{
+ {"__ldca_i4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldca_i4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldca_i8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldca_i8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldca_r2x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldca_r2x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldca_r4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldca_r4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldca_r8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldca_r8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_i4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcg_i4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_i8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcg_i8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_r2x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcg_r2x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_r4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcg_r4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcg_r8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcg_r8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_i4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcs_i4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_i8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcs_i8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_r2x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcs_r2x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_r4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcs_r4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcs_r8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcs_r8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcv_i4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcv_i4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcv_i8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcv_i8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcv_r2x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcv_r2x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcv_r4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcv_r4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldcv_r8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldcv_r8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldlu_i4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldlu_i4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldlu_i8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldlu_i8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldlu_r2x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldlu_r2x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldlu_r4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldlu_r4x4, 4>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"__ldlu_r8x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genLDXXFunc<__ldlu_r8x2, 2>),
+ {{{"a", asAddr}}},
+ /*isElemental=*/false},
+ {"all_sync",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genVoteSync<mlir::NVVM::VoteSyncKind::all>),
+ {{{"mask", asValue}, {"pred", asValue}}},
+ /*isElemental=*/false},
+ {"any_sync",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genVoteSync<mlir::NVVM::VoteSyncKind::any>),
+ {{{"mask", asValue}, {"pred", asValue}}},
+ /*isElemental=*/false},
+ {"atomicadd_r4x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genAtomicAddVector<2>),
+ {{{"a", asAddr}, {"v", asAddr}}},
+     /*isElemental=*/false},
+ {"atomicadd_r4x4",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genAtomicAddVector4x4),
+ {{{"a", asAddr}, {"v", asAddr}}},
+     /*isElemental=*/false},
+ {"atomicaddd",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicaddf",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicaddi",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicaddl",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicaddr2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicAddR2),
+ {{{"a", asAddr}, {"v", asAddr}}},
+     /*isElemental=*/false},
+ {"atomicaddvector_r2x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genAtomicAddVector<2>),
+ {{{"a", asAddr}, {"v", asAddr}}},
+     /*isElemental=*/false},
+ {"atomicaddvector_r4x2",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
+ &CI::genAtomicAddVector<2>),
+ {{{"a", asAddr}, {"v", asAddr}}},
+     /*isElemental=*/false},
+ {"atomicandi",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAnd),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomiccasd",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas),
+ {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
+     /*isElemental=*/false},
+ {"atomiccasf",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas),
+ {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
+     /*isElemental=*/false},
+ {"atomiccasi",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas),
+ {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
+     /*isElemental=*/false},
+ {"atomiccasul",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas),
+ {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
+     /*isElemental=*/false},
+ {"atomicdeci",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicDec),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicexchd",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicexchf",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicexchi",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicexchul",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicinci",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicInc),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicmaxd",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicmaxf",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicmaxi",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicmaxl",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicmind",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicminf",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicmini",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicminl",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicori",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicOr),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicsubd",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicsubf",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicsubi",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicsubl",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"atomicxori",
+ static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicXor),
+ {{{"a", asAddr}, {"v", asValue}}},
+     /*isElemental=*/false},
+ {"ballot_sync",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>),
+ {{{"mask", asValue}, {"pred", asValue}}},
+ /*isElemental=*/false},
+ {"barrier_arrive",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genBarrierArrive),
+ {{{"barrier", asAddr}}},
+ /*isElemental=*/false},
+ {"barrier_arrive_cnt",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genBarrierArriveCnt),
+ {{{"barrier", asAddr}, {"count", asValue}}},
+ /*isElemental=*/false},
+ {"barrier_init",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genBarrierInit),
+ {{{"barrier", asAddr}, {"count", asValue}}},
+ /*isElemental=*/false},
+ {"barrier_try_wait",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genBarrierTryWait),
+ {{{"barrier", asAddr}, {"token", asValue}}},
+ /*isElemental=*/false},
+ {"barrier_try_wait_sleep",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genBarrierTryWaitSleep),
+ {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}},
+ /*isElemental=*/false},
+ {"clock",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genNVVMTime<mlir::NVVM::ClockOp>),
+ {},
+ /*isElemental=*/false},
+ {"clock64",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genNVVMTime<mlir::NVVM::Clock64Op>),
+ {},
+ /*isElemental=*/false},
+ {"cluster_block_index",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genClusterBlockIndex),
+ {},
+ /*isElemental=*/false},
+ {"cluster_dim_blocks",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genClusterDimBlocks),
+ {},
+ /*isElemental=*/false},
+ {"fence_proxy_async",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genFenceProxyAsync),
+ {},
+ /*isElemental=*/false},
+ {"globaltimer",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genNVVMTime<mlir::NVVM::GlobalTimerOp>),
+ {},
+ /*isElemental=*/false},
+ {"match_all_syncjd",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genMatchAllSync),
+ {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
+ /*isElemental=*/false},
+ {"match_all_syncjf",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genMatchAllSync),
+ {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
+ /*isElemental=*/false},
+ {"match_all_syncjj",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genMatchAllSync),
+ {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
+ /*isElemental=*/false},
+ {"match_all_syncjx",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genMatchAllSync),
+ {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
+ /*isElemental=*/false},
+ {"match_any_syncjd",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genMatchAnySync),
+ {{{"mask", asValue}, {"value", asValue}}},
+ /*isElemental=*/false},
+ {"match_any_syncjf",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genMatchAnySync),
+ {{{"mask", asValue}, {"value", asValue}}},
+ /*isElemental=*/false},
+ {"match_any_syncjj",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genMatchAnySync),
+ {{{"mask", asValue}, {"value", asValue}}},
+ /*isElemental=*/false},
+ {"match_any_syncjx",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genMatchAnySync),
+ {{{"mask", asValue}, {"value", asValue}}},
+ /*isElemental=*/false},
+ {"syncthreads",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genSyncThreads),
+ {},
+ /*isElemental=*/false},
+ {"syncthreads_and_i4",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genSyncThreadsAnd),
+ {},
+ /*isElemental=*/false},
+ {"syncthreads_and_l4",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genSyncThreadsAnd),
+ {},
+ /*isElemental=*/false},
+ {"syncthreads_count_i4",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genSyncThreadsCount),
+ {},
+ /*isElemental=*/false},
+ {"syncthreads_count_l4",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genSyncThreadsCount),
+ {},
+ /*isElemental=*/false},
+ {"syncthreads_or_i4",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genSyncThreadsOr),
+ {},
+ /*isElemental=*/false},
+ {"syncthreads_or_l4",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genSyncThreadsOr),
+ {},
+ /*isElemental=*/false},
+ {"syncwarp",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genSyncWarp),
+ {},
+ /*isElemental=*/false},
+ {"this_cluster",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisCluster),
+ {},
+ /*isElemental=*/false},
+ {"this_grid",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisGrid),
+ {},
+ /*isElemental=*/false},
+ {"this_thread_block",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+ &CI::genThisThreadBlock),
+ {},
+ /*isElemental=*/false},
+ {"this_warp",
+ static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisWarp),
+ {},
+ /*isElemental=*/false},
+ {"threadfence",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genThreadFence<mlir::NVVM::MemScopeKind::GPU>),
+ {},
+ /*isElemental=*/false},
+ {"threadfence_block",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genThreadFence<mlir::NVVM::MemScopeKind::CTA>),
+ {},
+ /*isElemental=*/false},
+ {"threadfence_system",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genThreadFence<mlir::NVVM::MemScopeKind::SYS>),
+ {},
+ /*isElemental=*/false},
+ {"tma_bulk_commit_group",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkCommitGroup),
+ {{}},
+ /*isElemental=*/false},
+ {"tma_bulk_g2s",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkG2S),
+ {{{"barrier", asAddr},
+ {"src", asAddr},
+ {"dst", asAddr},
+ {"nbytes", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_ldc4",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkLoadC4),
+ {{{"barrier", asAddr},
+ {"src", asAddr},
+ {"dst", asAddr},
+ {"nelems", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_ldc8",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkLoadC8),
+ {{{"barrier", asAddr},
+ {"src", asAddr},
+ {"dst", asAddr},
+ {"nelems", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_ldi4",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkLoadI4),
+ {{{"barrier", asAddr},
+ {"src", asAddr},
+ {"dst", asAddr},
+ {"nelems", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_ldi8",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkLoadI8),
+ {{{"barrier", asAddr},
+ {"src", asAddr},
+ {"dst", asAddr},
+ {"nelems", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_ldr2",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkLoadR2),
+ {{{"barrier", asAddr},
+ {"src", asAddr},
+ {"dst", asAddr},
+ {"nelems", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_ldr4",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkLoadR4),
+ {{{"barrier", asAddr},
+ {"src", asAddr},
+ {"dst", asAddr},
+ {"nelems", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_ldr8",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkLoadR8),
+ {{{"barrier", asAddr},
+ {"src", asAddr},
+ {"dst", asAddr},
+ {"nelems", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_s2g",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkS2G),
+ {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_store_c4",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkStoreC4),
+ {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_store_c8",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkStoreC8),
+ {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_store_i4",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkStoreI4),
+ {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_store_i8",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkStoreI8),
+ {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_store_r2",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkStoreR2),
+ {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_store_r4",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkStoreR4),
+ {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_store_r8",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkStoreR8),
+ {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
+ /*isElemental=*/false},
+ {"tma_bulk_wait_group",
+ static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
+ &CI::genTMABulkWaitGroup),
+ {{}},
+ /*isElemental=*/false},
+};
+
+template <std::size_t N>
+static constexpr bool isSorted(const IntrinsicHandler (&array)[N]) {
+  // Replace with std::is_sorted once C++20 is the default (it is constexpr
+  // from C++20 on).
+ const IntrinsicHandler *lastSeen{nullptr};
+ bool isSorted{true};
+ for (const auto &x : array) {
+ if (lastSeen)
+ isSorted &= std::string_view{lastSeen->name} < std::string_view{x.name};
+ lastSeen = &x;
+ }
+ return isSorted;
+}
+static_assert(isSorted(cudaHandlers) && "map must be sorted");
+
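+// Binary-search the sorted cudaHandlers table. Returns nullptr when `name` is
+// not a CUDA-specific intrinsic. Caller-side sketch (illustrative only):
+//   if (const IntrinsicHandler *handler = findCUDAIntrinsicHandler(name))
+//     ... lower the call through *handler ...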
+const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name) {
+ auto compare = [](const IntrinsicHandler &cudaHandler, llvm::StringRef name) {
+ return name.compare(cudaHandler.name) > 0;
+ };
+ auto result = llvm::lower_bound(cudaHandlers, name, compare);
+ return result != std::end(cudaHandlers) && result->name == name ? result
+ : nullptr;
+}
+
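+// Convert a FIR pointer to an LLVM pointer placed in the given NVVM address
+// space (e.g. shared memory for mbarrier operations) via an addrspacecast.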
+static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder,
+ mlir::Location loc,
+ mlir::Value barrier,
+ mlir::NVVM::NVVMMemorySpace space) {
+ mlir::Value llvmPtr = fir::ConvertOp::create(
+ builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()),
+ barrier);
+ mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create(
+ builder, loc,
+ mlir::LLVM::LLVMPointerType::get(builder.getContext(),
+ static_cast<unsigned>(space)),
+ llvmPtr);
+ return addrCast;
+}
+
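+// Emit an llvm.atomicrmw with seq_cst ordering; the address operand is first
+// converted to an opaque LLVM pointer.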
+static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc,
+ mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0,
+ mlir::Value arg1) {
+ auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext());
+ arg0 = builder.createConvert(loc, llvmPointerType, arg0);
+ return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1,
+ mlir::LLVM::AtomicOrdering::seq_cst);
+}
+
+// ATOMICADD
+mlir::Value
+CUDAIntrinsicLibrary::genAtomicAdd(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
+ mlir::LLVM::AtomicBinOp binOp =
+ mlir::isa<mlir::IntegerType>(args[1].getType())
+ ? mlir::LLVM::AtomicBinOp::add
+ : mlir::LLVM::AtomicBinOp::fadd;
+ return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
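+// ATOMICADDR2
+// Pack the two f16 components of V into a <2 x f16> vector, perform a single
+// vector atomicrmw fadd, and return the previous value bitcast to an i32.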
+fir::ExtendedValue
+CUDAIntrinsicLibrary::genAtomicAddR2(mlir::Type resultType,
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 2);
+
+ mlir::Value a = fir::getBase(args[0]);
+
+ if (mlir::isa<fir::BaseBoxType>(a.getType())) {
+ a = fir::BoxAddrOp::create(builder, loc, a);
+ }
+
+ auto f16Ty = builder.getF16Type();
+ auto i32Ty = builder.getI32Type();
+ auto vecF16Ty = mlir::VectorType::get({2}, f16Ty);
+ mlir::Type idxTy = builder.getIndexType();
+ auto f16RefTy = fir::ReferenceType::get(f16Ty);
+ auto zero = builder.createIntegerConstant(loc, idxTy, 0);
+ auto one = builder.createIntegerConstant(loc, idxTy, 1);
+ auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy,
+ fir::getBase(args[1]), zero);
+ auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy,
+ fir::getBase(args[1]), one);
+ auto v1 = fir::LoadOp::create(builder, loc, v1Coord);
+ auto v2 = fir::LoadOp::create(builder, loc, v2Coord);
+ mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty);
+ mlir::Value vec1 = mlir::LLVM::InsertElementOp::create(
+ builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0));
+ mlir::Value vec2 = mlir::LLVM::InsertElementOp::create(
+ builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1));
+ auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2);
+ auto i32VecTy = mlir::VectorType::get({1}, i32Ty);
+ mlir::Value vecI32 =
+ mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res);
+ return mlir::vector::ExtractOp::create(builder, loc, vecI32,
+ mlir::ArrayRef<int64_t>{0});
+}
+
+// ATOMICADDVECTOR
+template <int extent>
+fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector(
+ mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 2);
+ mlir::Value res = fir::AllocaOp::create(
+ builder, loc, fir::SequenceType::get({extent}, resultType));
+ mlir::Value a = fir::getBase(args[0]);
+ if (mlir::isa<fir::BaseBoxType>(a.getType())) {
+ a = fir::BoxAddrOp::create(builder, loc, a);
+ }
+ auto vecTy = mlir::VectorType::get({extent}, resultType);
+ auto refTy = fir::ReferenceType::get(resultType);
+ mlir::Type i32Ty = builder.getI32Type();
+ mlir::Type idxTy = builder.getIndexType();
+
+ // Extract the values from the array.
+ llvm::SmallVector<mlir::Value> values;
+ for (unsigned i = 0; i < extent; ++i) {
+ mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i);
+ mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy,
+ fir::getBase(args[1]), pos);
+ mlir::Value value = fir::LoadOp::create(builder, loc, coord);
+ values.push_back(value);
+ }
+ // Pack extracted values into a vector to call the atomic add.
+ mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy);
+ for (unsigned i = 0; i < extent; ++i) {
+ mlir::Value insert = mlir::LLVM::InsertElementOp::create(
+ builder, loc, undef, values[i],
+ builder.createIntegerConstant(loc, i32Ty, i));
+ undef = insert;
+ }
+ // Atomic operation with a vector of values.
+ mlir::Value add =
+ genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef);
+ // Store results in the result array.
+ for (unsigned i = 0; i < extent; ++i) {
+ mlir::Value r = mlir::LLVM::ExtractElementOp::create(
+ builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i));
+ mlir::Value c = fir::CoordinateOp::create(
+ builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i));
+ fir::StoreOp::create(builder, loc, r, c);
+ }
+ mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent);
+ return fir::ArrayBoxValue(res, {ext});
+}
+
+// ATOMICADDVECTOR4x4
+fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector4x4(
+ mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 2);
+ mlir::Value a = fir::getBase(args[0]);
+ if (mlir::isa<fir::BaseBoxType>(a.getType()))
+ a = fir::BoxAddrOp::create(builder, loc, a);
+
+ const unsigned extent = 4;
+ auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
+ mlir::Value ptr = builder.createConvert(loc, llvmPtrTy, a);
+ mlir::Type f32Ty = builder.getF32Type();
+ mlir::Type idxTy = builder.getIndexType();
+ mlir::Type refTy = fir::ReferenceType::get(f32Ty);
+ llvm::SmallVector<mlir::Value> values;
+ for (unsigned i = 0; i < extent; ++i) {
+ mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i);
+ mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy,
+ fir::getBase(args[1]), pos);
+ mlir::Value value = fir::LoadOp::create(builder, loc, coord);
+ values.push_back(value);
+ }
+
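+  // The 4 x f32 case is emitted as inline PTX: atom.add.v4.f32 returns the
+  // four previous lanes, which are repacked below and bitcast to one i128.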
+ auto inlinePtx = mlir::NVVM::InlinePtxOp::create(
+ builder, loc, {f32Ty, f32Ty, f32Ty, f32Ty},
+ {ptr, values[0], values[1], values[2], values[3]}, {},
+ "atom.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};", {});
+
+ llvm::SmallVector<mlir::Value> results;
+ results.push_back(inlinePtx.getResult(0));
+ results.push_back(inlinePtx.getResult(1));
+ results.push_back(inlinePtx.getResult(2));
+ results.push_back(inlinePtx.getResult(3));
+
+ mlir::Type vecF32Ty = mlir::VectorType::get({extent}, f32Ty);
+ mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF32Ty);
+ mlir::Type i32Ty = builder.getI32Type();
+ for (unsigned i = 0; i < extent; ++i)
+ undef = mlir::LLVM::InsertElementOp::create(
+ builder, loc, undef, results[i],
+ builder.createIntegerConstant(loc, i32Ty, i));
+
+ auto i128Ty = builder.getIntegerType(128);
+ auto i128VecTy = mlir::VectorType::get({1}, i128Ty);
+ mlir::Value vec128 =
+ mlir::vector::BitCastOp::create(builder, loc, i128VecTy, undef);
+ return mlir::vector::ExtractOp::create(builder, loc, vec128,
+ mlir::ArrayRef<int64_t>{0});
+}
+
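+// ATOMICAND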
+mlir::Value
+CUDAIntrinsicLibrary::genAtomicAnd(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
+ assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
+
+ mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and;
+ return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
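+// ATOMICOR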
+mlir::Value
+CUDAIntrinsicLibrary::genAtomicOr(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
+ assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
+
+ mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or;
+ return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
+// ATOMICCAS
+fir::ExtendedValue
+CUDAIntrinsicLibrary::genAtomicCas(mlir::Type resultType,
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 3);
+ auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel;
+ auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic;
+ auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext());
+
+ mlir::Value arg0 = fir::getBase(args[0]);
+ mlir::Value arg1 = fir::getBase(args[1]);
+ mlir::Value arg2 = fir::getBase(args[2]);
+
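+  // llvm.cmpxchg only operates on integer or pointer values, so float
+  // arguments are bitcast to same-width integers first.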
+ auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value {
+ if (mlir::isa<mlir::Float32Type>(arg.getType()))
+ return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(),
+ arg);
+ if (mlir::isa<mlir::Float64Type>(arg.getType()))
+ return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(),
+ arg);
+ return arg;
+ };
+
+ arg1 = bitCastFloat(arg1);
+ arg2 = bitCastFloat(arg2);
+
+ if (arg1.getType() != arg2.getType()) {
+ // arg1 and arg2 need to have the same type in AtomicCmpXchgOp.
+ arg2 = builder.createConvert(loc, arg1.getType(), arg2);
+ }
+
+ auto address =
+ mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0)
+ .getResult(0);
+ auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create(
+ builder, loc, address, arg1, arg2, successOrdering, failureOrdering);
+ mlir::Value boolResult =
+ mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1);
+ return builder.createConvert(loc, resultType, boolResult);
+}
+
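+// ATOMICDEC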
+mlir::Value
+CUDAIntrinsicLibrary::genAtomicDec(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
+ assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
+
+ mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap;
+ return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
+// ATOMICEXCH
+fir::ExtendedValue
+CUDAIntrinsicLibrary::genAtomicExch(mlir::Type resultType,
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 2);
+ mlir::Value arg0 = fir::getBase(args[0]);
+ mlir::Value arg1 = fir::getBase(args[1]);
+ assert(arg1.getType().isIntOrFloat());
+
+ mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg;
+ return genAtomBinOp(builder, loc, binOp, arg0, arg1);
+}
+
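+// ATOMICINC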
+mlir::Value
+CUDAIntrinsicLibrary::genAtomicInc(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
+ assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
+
+ mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap;
+ return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
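+// ATOMICMAX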
+mlir::Value
+CUDAIntrinsicLibrary::genAtomicMax(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
+
+ mlir::LLVM::AtomicBinOp binOp =
+ mlir::isa<mlir::IntegerType>(args[1].getType())
+ ? mlir::LLVM::AtomicBinOp::max
+ : mlir::LLVM::AtomicBinOp::fmax;
+ return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
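+// ATOMICMIN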
+mlir::Value
+CUDAIntrinsicLibrary::genAtomicMin(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
+
+ mlir::LLVM::AtomicBinOp binOp =
+ mlir::isa<mlir::IntegerType>(args[1].getType())
+ ? mlir::LLVM::AtomicBinOp::min
+ : mlir::LLVM::AtomicBinOp::fmin;
+ return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
+// ATOMICSUB
+mlir::Value
+CUDAIntrinsicLibrary::genAtomicSub(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
+ mlir::LLVM::AtomicBinOp binOp =
+ mlir::isa<mlir::IntegerType>(args[1].getType())
+ ? mlir::LLVM::AtomicBinOp::sub
+ : mlir::LLVM::AtomicBinOp::fsub;
+ return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
+}
+
+// ATOMICXOR
+fir::ExtendedValue
+CUDAIntrinsicLibrary::genAtomicXor(mlir::Type resultType,
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 2);
+ mlir::Value arg0 = fir::getBase(args[0]);
+ mlir::Value arg1 = fir::getBase(args[1]);
+ return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1);
+}
+
+// BARRIER_ARRIVE
+mlir::Value
+CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 1);
+ mlir::Value barrier = convertPtrToNVVMSpace(
+ builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared);
+ return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType, barrier)
+ .getResult(0);
+}
+
+// BARRIER_ARRIVE_CNT
+mlir::Value
+CUDAIntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
+ mlir::Value barrier = convertPtrToNVVMSpace(
+ builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared);
+ return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType},
+ {barrier, args[1]}, {},
+ "mbarrier.arrive.expect_tx.release."
+ "cta.shared::cta.b64 %0, [%1], %2;",
+ {})
+ .getResult(0);
+}
+
+// BARRIER_INIT
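+// Initialize the mbarrier, then emit fence.proxy.async so the initialization
+// is visible to the async proxy before any TMA operation uses the barrier.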
+void CUDAIntrinsicLibrary::genBarrierInit(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 2);
+ mlir::Value barrier = convertPtrToNVVMSpace(
+ builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared);
+ mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier,
+ fir::getBase(args[1]), {});
+ auto kind = mlir::NVVM::ProxyKindAttr::get(
+ builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
+ auto space = mlir::NVVM::SharedSpaceAttr::get(
+ builder.getContext(), mlir::NVVM::SharedSpace::shared_cta);
+ mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
+}
+
+// BARRIER_TRY_WAIT
+mlir::Value
+CUDAIntrinsicLibrary::genBarrierTryWait(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
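+  // Poll the barrier in an scf.while loop: each inline-PTX mbarrier.try_wait
+  // attempt waits up to 1,000,000 ns and produces 1 once the wait completes.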
+ mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+ mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
+ fir::StoreOp::create(builder, loc, zero, res);
+ mlir::Value ns =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 1000000);
+ mlir::Value load = fir::LoadOp::create(builder, loc, res);
+ auto whileOp = mlir::scf::WhileOp::create(
+ builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load});
+ mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore());
+ mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc);
+ builder.setInsertionPointToStart(beforeBlock);
+ mlir::Value condition = mlir::arith::CmpIOp::create(
+ builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero);
+ mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg);
+ mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter());
+ afterBlock->addArgument(resultType, loc);
+ builder.setInsertionPointToStart(afterBlock);
+ auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
+ auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]);
+ mlir::Value ret = mlir::NVVM::InlinePtxOp::create(
+ builder, loc, {resultType}, {barrier, args[1], ns}, {},
+ "{\n"
+ " .reg .pred p;\n"
+ " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n"
+ " selp.b32 %0, 1, 0, p;\n"
+ "}",
+ {})
+ .getResult(0);
+ mlir::scf::YieldOp::create(builder, loc, ret);
+ builder.setInsertionPointAfter(whileOp);
+ return whileOp.getResult(0);
+}
+
+// BARRIER_TRY_WAIT_SLEEP
+mlir::Value
+CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 3);
+ auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
+ auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]);
+ return mlir::NVVM::InlinePtxOp::create(
+ builder, loc, {resultType}, {barrier, args[1], args[2]}, {},
+ "{\n"
+ " .reg .pred p;\n"
+ " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n"
+ " selp.b32 %0, 1, 0, p;\n"
+ "}",
+ {})
+ .getResult(0);
+}
+
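+// Store `dim` into the field at position `fieldPos` of the record `base`.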
+static void insertValueAtPos(fir::FirOpBuilder &builder, mlir::Location loc,
+ fir::RecordType recTy, mlir::Value base,
+ mlir::Value dim, unsigned fieldPos) {
+ auto fieldName = recTy.getTypeList()[fieldPos].first;
+ mlir::Type fieldTy = recTy.getTypeList()[fieldPos].second;
+ mlir::Type fieldIndexType = fir::FieldType::get(base.getContext());
+ mlir::Value fieldIndex =
+ fir::FieldIndexOp::create(builder, loc, fieldIndexType, fieldName, recTy,
+ /*typeParams=*/mlir::ValueRange{});
+ mlir::Value coord = fir::CoordinateOp::create(
+ builder, loc, builder.getRefType(fieldTy), base, fieldIndex);
+ fir::StoreOp::create(builder, loc, dim, coord);
+}
+
+// CLUSTER_BLOCK_INDEX
+mlir::Value
+CUDAIntrinsicLibrary::genClusterBlockIndex(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 0);
+ auto recTy = mlir::cast<fir::RecordType>(resultType);
+  assert(recTy && "RecordType expected");
+ mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+ mlir::Type i32Ty = builder.getI32Type();
+ mlir::Value x = mlir::NVVM::BlockInClusterIdXOp::create(builder, loc, i32Ty);
+ mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
+ x = mlir::arith::AddIOp::create(builder, loc, x, one);
+ insertValueAtPos(builder, loc, recTy, res, x, 0);
+ mlir::Value y = mlir::NVVM::BlockInClusterIdYOp::create(builder, loc, i32Ty);
+ y = mlir::arith::AddIOp::create(builder, loc, y, one);
+ insertValueAtPos(builder, loc, recTy, res, y, 1);
+ mlir::Value z = mlir::NVVM::BlockInClusterIdZOp::create(builder, loc, i32Ty);
+ z = mlir::arith::AddIOp::create(builder, loc, z, one);
+ insertValueAtPos(builder, loc, recTy, res, z, 2);
+ return res;
+}
+
+// CLUSTER_DIM_BLOCKS
+mlir::Value
+CUDAIntrinsicLibrary::genClusterDimBlocks(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 0);
+ auto recTy = mlir::cast<fir::RecordType>(resultType);
+  assert(recTy && "RecordType expected");
+ mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+ mlir::Type i32Ty = builder.getI32Type();
+ mlir::Value x = mlir::NVVM::ClusterDimBlocksXOp::create(builder, loc, i32Ty);
+ insertValueAtPos(builder, loc, recTy, res, x, 0);
+ mlir::Value y = mlir::NVVM::ClusterDimBlocksYOp::create(builder, loc, i32Ty);
+ insertValueAtPos(builder, loc, recTy, res, y, 1);
+ mlir::Value z = mlir::NVVM::ClusterDimBlocksZOp::create(builder, loc, i32Ty);
+ insertValueAtPos(builder, loc, recTy, res, z, 2);
+ return res;
+}
+
+// FENCE_PROXY_ASYNC
+void CUDAIntrinsicLibrary::genFenceProxyAsync(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 0);
+ auto kind = mlir::NVVM::ProxyKindAttr::get(
+ builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
+ auto space = mlir::NVVM::SharedSpaceAttr::get(
+ builder.getContext(), mlir::NVVM::SharedSpace::shared_cta);
+ mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
+}
+
+// __LDCA, __LDCS, __LDLU, __LDCV
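+// These lower to calls to routines expected from the device runtime (one per
+// cache hint, element type, and vector width); the loaded elements come back
+// through a stack temporary of `extent` elements.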
+template <const char *fctName, int extent>
+fir::ExtendedValue
+CUDAIntrinsicLibrary::genLDXXFunc(mlir::Type resultType,
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 1);
+ mlir::Type resTy = fir::SequenceType::get(extent, resultType);
+ mlir::Value arg = fir::getBase(args[0]);
+ mlir::Value res = fir::AllocaOp::create(builder, loc, resTy);
+ if (mlir::isa<fir::BaseBoxType>(arg.getType()))
+ arg = fir::BoxAddrOp::create(builder, loc, arg);
+ mlir::Type refResTy = fir::ReferenceType::get(resTy);
+ mlir::FunctionType ftype =
+ mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {});
+ auto funcOp = builder.createFunction(loc, fctName, ftype);
+ llvm::SmallVector<mlir::Value> funcArgs;
+ funcArgs.push_back(res);
+ funcArgs.push_back(arg);
+ fir::CallOp::create(builder, loc, funcOp, funcArgs);
+ mlir::Value ext =
+ builder.createIntegerConstant(loc, builder.getIndexType(), extent);
+ return fir::ArrayBoxValue(res, {ext});
+}
+
+// CLOCK, CLOCK64, GLOBALTIMER
+template <typename OpTy>
+mlir::Value
+CUDAIntrinsicLibrary::genNVVMTime(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 0 && "expect no arguments");
+ return OpTy::create(builder, loc, resultType).getResult();
+}
+
+// MATCH_ALL_SYNC
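+// NVVM match.sync.all returns a {value, predicate} pair: the predicate is
+// zero-extended and stored through the PRED argument; the value is returned.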
+mlir::Value
+CUDAIntrinsicLibrary::genMatchAllSync(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 3);
+ bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32();
+
+ mlir::Type i1Ty = builder.getI1Type();
+ mlir::MLIRContext *context = builder.getContext();
+
+ mlir::Value arg1 = args[1];
+ if (arg1.getType().isF32() || arg1.getType().isF64())
+ arg1 = fir::ConvertOp::create(
+ builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1);
+
+ mlir::Type retTy =
+ mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty});
+ auto match =
+ mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1,
+ mlir::NVVM::MatchSyncKind::all)
+ .getResult();
+ auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0);
+ auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1);
+ auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred);
+ fir::StoreOp::create(builder, loc, conv, args[2]);
+ return value;
+}
+
+// MATCH_ANY_SYNC
+mlir::Value
+CUDAIntrinsicLibrary::genMatchAnySync(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
+ bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32();
+
+ mlir::Value arg1 = args[1];
+ if (arg1.getType().isF32() || arg1.getType().isF64())
+ arg1 = fir::ConvertOp::create(
+ builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1);
+
+ return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0],
+ arg1, mlir::NVVM::MatchSyncKind::any)
+ .getResult();
+}
+
+// SYNCTHREADS
+void CUDAIntrinsicLibrary::genSyncThreads(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ mlir::NVVM::Barrier0Op::create(builder, loc);
+}
+
+// SYNCTHREADS_AND
+mlir::Value
+CUDAIntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]);
+ return mlir::NVVM::BarrierOp::create(
+ builder, loc, resultType, {}, {},
+ mlir::NVVM::BarrierReductionAttr::get(
+ builder.getContext(), mlir::NVVM::BarrierReduction::AND),
+ arg)
+ .getResult(0);
+}
+
+// SYNCTHREADS_COUNT
+mlir::Value
+CUDAIntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]);
+ return mlir::NVVM::BarrierOp::create(
+ builder, loc, resultType, {}, {},
+ mlir::NVVM::BarrierReductionAttr::get(
+ builder.getContext(), mlir::NVVM::BarrierReduction::POPC),
+ arg)
+ .getResult(0);
+}
+
+// SYNCTHREADS_OR
+mlir::Value
+CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]);
+ return mlir::NVVM::BarrierOp::create(
+ builder, loc, resultType, {}, {},
+ mlir::NVVM::BarrierReductionAttr::get(
+ builder.getContext(), mlir::NVVM::BarrierReduction::OR),
+ arg)
+ .getResult(0);
+}
+
+// SYNCWARP
+void CUDAIntrinsicLibrary::genSyncWarp(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 1);
+ mlir::NVVM::SyncWarpOp::create(builder, loc, fir::getBase(args[0]));
+}
+
+// THIS_CLUSTER
+mlir::Value
+CUDAIntrinsicLibrary::genThisCluster(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 0);
+ auto recTy = mlir::cast<fir::RecordType>(resultType);
+  assert(recTy && "RecordType expected");
+ mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+ mlir::Type i32Ty = builder.getI32Type();
+
+ // SIZE
+ mlir::Value size = mlir::NVVM::ClusterDim::create(builder, loc, i32Ty);
+ auto sizeFieldName = recTy.getTypeList()[1].first;
+ mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
+ mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
+ mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
+ builder, loc, fieldIndexType, sizeFieldName, recTy,
+ /*typeParams=*/mlir::ValueRange{});
+ mlir::Value sizeCoord = fir::CoordinateOp::create(
+ builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
+ fir::StoreOp::create(builder, loc, size, sizeCoord);
+
+ // RANK
+ mlir::Value rank = mlir::NVVM::ClusterId::create(builder, loc, i32Ty);
+ mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
+ rank = mlir::arith::AddIOp::create(builder, loc, rank, one);
+ auto rankFieldName = recTy.getTypeList()[2].first;
+ mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
+ mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
+ builder, loc, fieldIndexType, rankFieldName, recTy,
+ /*typeParams=*/mlir::ValueRange{});
+ mlir::Value rankCoord = fir::CoordinateOp::create(
+ builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
+ fir::StoreOp::create(builder, loc, rank, rankCoord);
+
+ return res;
+}
+
+// THIS_GRID
+mlir::Value
+CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 0);
+ auto recTy = mlir::cast<fir::RecordType>(resultType);
+  assert(recTy && "RecordType expected");
+ mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+ mlir::Type i32Ty = builder.getI32Type();
+
+ mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty);
+ mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty);
+ mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty);
+
+ mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty);
+ mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty);
+ mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty);
+
+ mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty);
+ mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty);
+ mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty);
+ mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty);
+ mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty);
+ mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty);
+
+ // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) *
+ // (blockDim.x * gridDim.x);
+ mlir::Value resZ =
+ mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ);
+ mlir::Value resY =
+ mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY);
+ mlir::Value resX =
+ mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX);
+ mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY);
+ mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX);
+
+ // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) +
+ // blockIdx.x;
+  // this_grid.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) +
+  //                  ((threadIdx.z * blockDim.y) * blockDim.x) +
+  //                  (threadIdx.y * blockDim.x) + threadIdx.x + 1;
+ mlir::Value r1 =
+ mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY);
+ mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX);
+ mlir::Value r3 =
+ mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX);
+ mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3);
+ mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX);
+
+ mlir::Value bXbY =
+ mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY);
+ mlir::Value bXbYbZ =
+ mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ);
+ mlir::Value tZbY =
+ mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY);
+ mlir::Value tZbYbX =
+ mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX);
+ mlir::Value tYbX =
+ mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX);
+ mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ);
+ rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX);
+ rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX);
+ rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX);
+ mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
+ rank = mlir::arith::AddIOp::create(builder, loc, rank, one);
+
+ auto sizeFieldName = recTy.getTypeList()[1].first;
+ mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
+ mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
+ mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
+ builder, loc, fieldIndexType, sizeFieldName, recTy,
+ /*typeParams=*/mlir::ValueRange{});
+ mlir::Value sizeCoord = fir::CoordinateOp::create(
+ builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
+ fir::StoreOp::create(builder, loc, size, sizeCoord);
+
+ auto rankFieldName = recTy.getTypeList()[2].first;
+ mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
+ mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
+ builder, loc, fieldIndexType, rankFieldName, recTy,
+ /*typeParams=*/mlir::ValueRange{});
+ mlir::Value rankCoord = fir::CoordinateOp::create(
+ builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
+ fir::StoreOp::create(builder, loc, rank, rankCoord);
+ return res;
+}
+
+// THIS_THREAD_BLOCK
+mlir::Value
+CUDAIntrinsicLibrary::genThisThreadBlock(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 0);
+ auto recTy = mlir::cast<fir::RecordType>(resultType);
+  assert(recTy && "RecordType expected");
+ mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+ mlir::Type i32Ty = builder.getI32Type();
+
+ // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x;
+ mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty);
+ mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty);
+ mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty);
+ mlir::Value size =
+ mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY);
+ size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX);
+
+ // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) +
+ // (threadIdx.y * blockDim.x) + threadIdx.x + 1;
+ mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty);
+ mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty);
+ mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty);
+ mlir::Value r1 =
+ mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY);
+ mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX);
+ mlir::Value r3 =
+ mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX);
+ mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3);
+ mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX);
+ mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
+ rank = mlir::arith::AddIOp::create(builder, loc, rank, one);
+
+ auto sizeFieldName = recTy.getTypeList()[1].first;
+ mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
+ mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
+ mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
+ builder, loc, fieldIndexType, sizeFieldName, recTy,
+ /*typeParams=*/mlir::ValueRange{});
+ mlir::Value sizeCoord = fir::CoordinateOp::create(
+ builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
+ fir::StoreOp::create(builder, loc, size, sizeCoord);
+
+ auto rankFieldName = recTy.getTypeList()[2].first;
+ mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
+ mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
+ builder, loc, fieldIndexType, rankFieldName, recTy,
+ /*typeParams=*/mlir::ValueRange{});
+ mlir::Value rankCoord = fir::CoordinateOp::create(
+ builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
+ fir::StoreOp::create(builder, loc, rank, rankCoord);
+ return res;
+}
+
+// THIS_WARP
+mlir::Value
+CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 0);
+ auto recTy = mlir::cast<fir::RecordType>(resultType);
+  assert(recTy && "RecordType expected");
+ mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+ mlir::Type i32Ty = builder.getI32Type();
+
+ // coalesced_group%size = 32
+ mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32);
+ auto sizeFieldName = recTy.getTypeList()[1].first;
+ mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
+ mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
+ mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
+ builder, loc, fieldIndexType, sizeFieldName, recTy,
+ /*typeParams=*/mlir::ValueRange{});
+ mlir::Value sizeCoord = fir::CoordinateOp::create(
+ builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
+ fir::StoreOp::create(builder, loc, size, sizeCoord);
+
+  // coalesced_group%rank = (threadIdx.x & 31) + 1
+ mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty);
+ mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31);
+ mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
+ mlir::Value masked =
+ mlir::arith::AndIOp::create(builder, loc, threadIdX, mask);
+ mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one);
+ auto rankFieldName = recTy.getTypeList()[2].first;
+ mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
+ mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
+ builder, loc, fieldIndexType, rankFieldName, recTy,
+ /*typeParams=*/mlir::ValueRange{});
+ mlir::Value rankCoord = fir::CoordinateOp::create(
+ builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
+ fir::StoreOp::create(builder, loc, rank, rankCoord);
+ return res;
+}
+
+// THREADFENCE, THREADFENCE_BLOCK, THREADFENCE_SYSTEM
+template <mlir::NVVM::MemScopeKind scope>
+void CUDAIntrinsicLibrary::genThreadFence(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 0);
+ mlir::NVVM::MembarOp::create(builder, loc, scope);
+}
+
+// TMA_BULK_COMMIT_GROUP
+void CUDAIntrinsicLibrary::genTMABulkCommitGroup(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 0);
+ mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc);
+}
+
+// TMA_BULK_G2S
+void CUDAIntrinsicLibrary::genTMABulkG2S(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 4);
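+  // Address spaces: the mbarrier lives in CTA shared memory, the destination
+  // in cluster shared memory, and the source in global memory.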
+ mlir::Value barrier = convertPtrToNVVMSpace(
+ builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared);
+ mlir::Value dst =
+ convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]),
+ mlir::NVVM::NVVMMemorySpace::SharedCluster);
+ mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]),
+ mlir::NVVM::NVVMMemorySpace::Global);
+ mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create(
+ builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {});
+}
+
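+// If the pointer is an hlfir.declare of a cuf.shared_memory allocation, set
+// the allocation's alignment so the shared buffer satisfies TMA requirements.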
+static void setAlignment(mlir::Value ptr, unsigned alignment) {
+ if (auto declareOp = mlir::dyn_cast<hlfir::DeclareOp>(ptr.getDefiningOp()))
+ if (auto sharedOp = mlir::dyn_cast<cuf::SharedMemoryOp>(
+ declareOp.getMemref().getDefiningOp()))
+ sharedOp.setAlignment(alignment);
+}
+
+static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value barrier, mlir::Value src,
+ mlir::Value dst, mlir::Value nelem,
+ mlir::Value eleSize) {
+ mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize);
+ auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
+ barrier = builder.createConvert(loc, llvmPtrTy, barrier);
+ setAlignment(dst, kTMAAlignment);
+ dst = builder.createConvert(loc, llvmPtrTy, dst);
+ src = builder.createConvert(loc, llvmPtrTy, src);
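+  // Bulk-copy `size` bytes from global to cluster shared memory; completion
+  // is signaled through the mbarrier.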
+ mlir::NVVM::InlinePtxOp::create(
+ builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {},
+ "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], "
+ "[%1], %2, [%3];",
+ {});
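+  // Update the mbarrier's expected transaction count with the copied bytes.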
+ mlir::NVVM::InlinePtxOp::create(
+ builder, loc, mlir::TypeRange{}, {barrier, size}, {},
+ "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {});
+}
+
+// TMA_BULK_LOADC4
+void CUDAIntrinsicLibrary::genTMABulkLoadC4(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 4);
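+  // A complex(4) element occupies 8 bytes (two 4-byte reals).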
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 8);
+ genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
+// TMA_BULK_LOADC8
+void CUDAIntrinsicLibrary::genTMABulkLoadC8(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 4);
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 16);
+ genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
+// TMA_BULK_LOADI4
+void CUDAIntrinsicLibrary::genTMABulkLoadI4(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 4);
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 4);
+ genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
+// TMA_BULK_LOADI8
+void CUDAIntrinsicLibrary::genTMABulkLoadI8(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 4);
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 8);
+ genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
+// TMA_BULK_LOADR2
+void CUDAIntrinsicLibrary::genTMABulkLoadR2(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 4);
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 2);
+ genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
+// TMA_BULK_LOADR4
+void CUDAIntrinsicLibrary::genTMABulkLoadR4(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 4);
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 4);
+ genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
+// TMA_BULK_LOADR8
+void CUDAIntrinsicLibrary::genTMABulkLoadR8(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 4);
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 8);
+ genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
+}
+
+// TMA_BULK_S2G
+void CUDAIntrinsicLibrary::genTMABulkS2G(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 3);
+ mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]),
+ mlir::NVVM::NVVMMemorySpace::Shared);
+ mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]),
+ mlir::NVVM::NVVMMemorySpace::Global);
+ mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(
+ builder, loc, dst, src, fir::getBase(args[2]), {}, {});
+
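+  // Commit the copy into a bulk-async group and wait for group 0 to complete.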
+ mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {},
+ "cp.async.bulk.commit_group;", {});
+ mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc,
+ builder.getI32IntegerAttr(0), {});
+}
+
+static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value src, mlir::Value dst, mlir::Value count,
+ mlir::Value eleSize) {
+ mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count);
+ setAlignment(src, kTMAAlignment);
+ src = convertPtrToNVVMSpace(builder, loc, src,
+ mlir::NVVM::NVVMMemorySpace::Shared);
+ dst = convertPtrToNVVMSpace(builder, loc, dst,
+ mlir::NVVM::NVVMMemorySpace::Global);
+ mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src,
+ size, {}, {});
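+  // Commit the store into a bulk-async group and wait for it to complete.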
+ mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {},
+ "cp.async.bulk.commit_group;", {});
+ mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc,
+ builder.getI32IntegerAttr(0), {});
+}
+
+// TMA_BULK_STORE_C4
+void CUDAIntrinsicLibrary::genTMABulkStoreC4(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 3);
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 8);
+ genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), eleSize);
+}
+
+// TMA_BULK_STORE_C8
+void CUDAIntrinsicLibrary::genTMABulkStoreC8(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 3);
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 16);
+ genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), eleSize);
+}
+
+// TMA_BULK_STORE_I4
+void CUDAIntrinsicLibrary::genTMABulkStoreI4(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 3);
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 4);
+ genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), eleSize);
+}
+
+// TMA_BULK_STORE_I8
+void CUDAIntrinsicLibrary::genTMABulkStoreI8(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 3);
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 8);
+ genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), eleSize);
+}
+
+// TMA_BULK_STORE_R2
+void CUDAIntrinsicLibrary::genTMABulkStoreR2(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 3);
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 2);
+ genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), eleSize);
+}
+
+// TMA_BULK_STORE_R4
+void CUDAIntrinsicLibrary::genTMABulkStoreR4(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 3);
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 4);
+ genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), eleSize);
+}
+
+// TMA_BULK_STORE_R8
+void CUDAIntrinsicLibrary::genTMABulkStoreR8(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 3);
+ mlir::Value eleSize =
+ builder.createIntegerConstant(loc, builder.getI32Type(), 8);
+ genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
+ fir::getBase(args[2]), eleSize);
+}
+
+// TMA_BULK_WAIT_GROUP
+void CUDAIntrinsicLibrary::genTMABulkWaitGroup(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 0);
+ auto group = builder.getIntegerAttr(builder.getI32Type(), 0);
+ mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {});
+}
+
+// ALL_SYNC, ANY_SYNC, BALLOT_SYNC
+template <mlir::NVVM::VoteSyncKind kind>
+mlir::Value
+CUDAIntrinsicLibrary::genVoteSync(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 2);
+ mlir::Value arg1 =
+ fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]);
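+  // BALLOT returns a 32-bit mask, while ALL and ANY return a single
+  // predicate bit.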
+ mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot
+ ? builder.getI32Type()
+ : builder.getI1Type();
+ auto voteRes =
+ mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind)
+ .getResult();
+ return fir::ConvertOp::create(builder, loc, resultType, voteRes);
+}
+
+} // namespace fir
diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp
index cf7588f..2266f4d 100644
--- a/flang/lib/Optimizer/Builder/CUFCommon.cpp
+++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp
@@ -9,6 +9,7 @@
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
+#include "flang/Optimizer/Dialect/Support/KindMapping.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
@@ -91,3 +92,66 @@ void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) {
}
}
}
+
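+// Return the size in bytes of one element of \p type, stripping any sequence
+// (array) type first; returns 0 when the element type is unsupported.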
+int cuf::computeElementByteSize(mlir::Location loc, mlir::Type type,
+ fir::KindMapping &kindMap,
+ bool emitErrorOnFailure) {
+ auto eleTy = fir::unwrapSequenceType(type);
+ if (auto t{mlir::dyn_cast<mlir::IntegerType>(eleTy)})
+ return t.getWidth() / 8;
+ if (auto t{mlir::dyn_cast<mlir::FloatType>(eleTy)})
+ return t.getWidth() / 8;
+ if (auto t{mlir::dyn_cast<fir::LogicalType>(eleTy)})
+ return kindMap.getLogicalBitsize(t.getFKind()) / 8;
+ if (auto t{mlir::dyn_cast<mlir::ComplexType>(eleTy)}) {
+ int elemSize =
+ mlir::cast<mlir::FloatType>(t.getElementType()).getWidth() / 8;
+ return 2 * elemSize;
+ }
+ if (auto t{mlir::dyn_cast<fir::CharacterType>(eleTy)})
+ return kindMap.getCharacterBitsize(t.getFKind()) / 8;
+ if (emitErrorOnFailure)
+ mlir::emitError(loc, "unsupported type");
+ return 0;
+}
+
+mlir::Value cuf::computeElementCount(mlir::PatternRewriter &rewriter,
+ mlir::Location loc,
+ mlir::Value shapeOperand,
+ mlir::Type seqType,
+ mlir::Type targetType) {
+ if (shapeOperand) {
+ // Dynamic extent - extract from shape operand
+ llvm::SmallVector<mlir::Value> extents;
+ if (auto shapeOp =
+ mlir::dyn_cast<fir::ShapeOp>(shapeOperand.getDefiningOp())) {
+ extents = shapeOp.getExtents();
+ } else if (auto shapeShiftOp = mlir::dyn_cast<fir::ShapeShiftOp>(
+ shapeOperand.getDefiningOp())) {
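+      // Shape-shift operands alternate (lower bound, extent); keep extents.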
+ for (auto i : llvm::enumerate(shapeShiftOp.getPairs()))
+ if (i.index() & 1)
+ extents.push_back(i.value());
+ }
+
+ if (extents.empty())
+ return mlir::Value();
+
+ // Compute total element count by multiplying all dimensions
+ mlir::Value count =
+ fir::ConvertOp::create(rewriter, loc, targetType, extents[0]);
+ for (unsigned i = 1; i < extents.size(); ++i) {
+ auto operand =
+ fir::ConvertOp::create(rewriter, loc, targetType, extents[i]);
+ count = mlir::arith::MulIOp::create(rewriter, loc, count, operand);
+ }
+ return count;
+ } else {
+ // Static extent - use constant array size
+ if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(seqType)) {
+ mlir::IntegerAttr attr =
+ rewriter.getIntegerAttr(targetType, seqTy.getConstantArraySize());
+ return mlir::arith::ConstantOp::create(rewriter, loc, targetType, attr);
+ }
+ }
+ return mlir::Value();
+}
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 5da27d1..c704ac7 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -427,7 +427,8 @@ mlir::Value fir::FirOpBuilder::genTempDeclareOp(
builder, loc, memref.getType(), memref, shape, typeParams,
/*dummy_scope=*/nullptr,
/*storage=*/nullptr,
- /*storage_offset=*/0, nameAttr, fortranAttrs, cuf::DataAttributeAttr{});
+ /*storage_offset=*/0, nameAttr, fortranAttrs, cuf::DataAttributeAttr{},
+ /*dummy_arg_no=*/mlir::IntegerAttr{});
}
mlir::Value fir::FirOpBuilder::genStackSave(mlir::Location loc) {
@@ -1392,12 +1393,10 @@ fir::ExtendedValue fir::factory::arraySectionElementToExtendedValue(
return fir::factory::componentToExtendedValue(builder, loc, element);
}
-void fir::factory::genScalarAssignment(fir::FirOpBuilder &builder,
- mlir::Location loc,
- const fir::ExtendedValue &lhs,
- const fir::ExtendedValue &rhs,
- bool needFinalization,
- bool isTemporaryLHS) {
+void fir::factory::genScalarAssignment(
+ fir::FirOpBuilder &builder, mlir::Location loc,
+ const fir::ExtendedValue &lhs, const fir::ExtendedValue &rhs,
+ bool needFinalization, bool isTemporaryLHS, mlir::ArrayAttr accessGroups) {
assert(lhs.rank() == 0 && rhs.rank() == 0 && "must be scalars");
auto type = fir::unwrapSequenceType(
fir::unwrapPassByRefType(fir::getBase(lhs).getType()));
@@ -1419,7 +1418,9 @@ void fir::factory::genScalarAssignment(fir::FirOpBuilder &builder,
mlir::Value lhsAddr = fir::getBase(lhs);
rhsVal = builder.createConvert(loc, fir::unwrapRefType(lhsAddr.getType()),
rhsVal);
- fir::StoreOp::create(builder, loc, rhsVal, lhsAddr);
+ fir::StoreOp store = fir::StoreOp::create(builder, loc, rhsVal, lhsAddr);
+ if (accessGroups)
+ store.setAccessGroupsAttr(accessGroups);
}
}
@@ -1670,6 +1671,26 @@ mlir::Value fir::factory::createZeroValue(fir::FirOpBuilder &builder,
"numeric or logical type");
}
+mlir::Value fir::factory::createOneValue(fir::FirOpBuilder &builder,
+ mlir::Location loc, mlir::Type type) {
+ mlir::Type i1 = builder.getIntegerType(1);
+ if (mlir::isa<fir::LogicalType>(type) || type == i1)
+ return builder.createConvert(loc, type, builder.createBool(loc, true));
+ if (fir::isa_integer(type))
+ return builder.createIntegerConstant(loc, type, 1);
+ if (fir::isa_real(type))
+ return builder.createRealOneConstant(loc, type);
+ if (fir::isa_complex(type)) {
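+    // Complex one is (1.0, 0.0).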
+ fir::factory::Complex complexHelper(builder, loc);
+ mlir::Type partType = complexHelper.getComplexPartType(type);
+ mlir::Value realPart = builder.createRealOneConstant(loc, partType);
+ mlir::Value imagPart = builder.createRealZeroConstant(loc, partType);
+ return complexHelper.createComplex(type, realPart, imagPart);
+ }
+ fir::emitFatalError(loc, "internal: trying to generate one value of non "
+ "numeric or logical type");
+}
+
std::optional<std::int64_t>
fir::factory::getExtentFromTriplet(mlir::Value lb, mlir::Value ub,
mlir::Value stride) {
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 93dfc57..a345dcb 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -250,7 +250,7 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder,
const fir::ExtendedValue &exv, llvm::StringRef name,
fir::FortranVariableFlagsAttr flags, mlir::Value dummyScope,
mlir::Value storage, std::uint64_t storageOffset,
- cuf::DataAttributeAttr dataAttr) {
+ cuf::DataAttributeAttr dataAttr, unsigned dummyArgNo) {
mlir::Value base = fir::getBase(exv);
assert(fir::conformsWithPassByRef(base.getType()) &&
@@ -281,7 +281,7 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder,
[](const auto &) {});
auto declareOp = hlfir::DeclareOp::create(
builder, loc, base, name, shapeOrShift, lenParams, dummyScope, storage,
- storageOffset, flags, dataAttr);
+ storageOffset, flags, dataAttr, dummyArgNo);
return mlir::cast<fir::FortranVariableOpInterface>(declareOp.getOperation());
}
@@ -402,9 +402,9 @@ hlfir::Entity hlfir::genVariableBox(mlir::Location loc,
fir::BoxType::get(var.getElementOrSequenceType(), isVolatile);
if (forceBoxType) {
boxType = forceBoxType;
- mlir::Type baseType =
- fir::ReferenceType::get(fir::unwrapRefType(forceBoxType.getEleTy()));
- addr = builder.createConvert(loc, baseType, addr);
+ mlir::Type baseType = fir::ReferenceType::get(
+ fir::unwrapRefType(forceBoxType.getEleTy()), forceBoxType.isVolatile());
+ addr = builder.createConvertWithVolatileCast(loc, baseType, addr);
}
auto embox = fir::EmboxOp::create(builder, loc, boxType, addr, shape,
/*slice=*/mlir::Value{}, typeParams);
@@ -1392,6 +1392,66 @@ bool hlfir::elementalOpMustProduceTemp(hlfir::ElementalOp elemental) {
return false;
}
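+/// Store \p rhs into \p lhs, first combining it with the current \p lhs value
+/// when a \p combiner callback is provided.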
+static void combineAndStoreElement(
+ mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity lhs,
+ hlfir::Entity rhs, bool temporaryLHS,
+ std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
+ hlfir::Entity, hlfir::Entity)> *combiner) {
+ hlfir::Entity valueToAssign = hlfir::loadTrivialScalar(loc, builder, rhs);
+ if (combiner) {
+ hlfir::Entity lhsValue = hlfir::loadTrivialScalar(loc, builder, lhs);
+ valueToAssign = (*combiner)(loc, builder, lhsValue, valueToAssign);
+ }
+ hlfir::AssignOp::create(builder, loc, valueToAssign, lhs,
+ /*realloc=*/false,
+ /*keep_lhs_length_if_realloc=*/false,
+ /*temporary_lhs=*/temporaryLHS);
+}
+
+void hlfir::genNoAliasArrayAssignment(
+ mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
+ hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS,
+ std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
+ hlfir::Entity, hlfir::Entity)> *combiner) {
+ mlir::OpBuilder::InsertionGuard guard(builder);
+ rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
+ lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs);
+ mlir::Value lhsShape = hlfir::genShape(loc, builder, lhs);
+ llvm::SmallVector<mlir::Value> lhsExtents =
+ hlfir::getIndexExtents(loc, builder, lhsShape);
+ mlir::Value rhsShape = hlfir::genShape(loc, builder, rhs);
+ llvm::SmallVector<mlir::Value> rhsExtents =
+ hlfir::getIndexExtents(loc, builder, rhsShape);
+ llvm::SmallVector<mlir::Value> extents =
+ fir::factory::deduceOptimalExtents(lhsExtents, rhsExtents);
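+  // Emit an unordered elemental loop (optionally a workshare loop) that
+  // assigns, or combines, each rhs element into the matching lhs element.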
+ hlfir::LoopNest loopNest =
+ hlfir::genLoopNest(loc, builder, extents,
+ /*isUnordered=*/true, emitWorkshareLoop);
+ builder.setInsertionPointToStart(loopNest.body);
+ auto rhsArrayElement =
+ hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
+ rhsArrayElement = hlfir::loadTrivialScalar(loc, builder, rhsArrayElement);
+ auto lhsArrayElement =
+ hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices);
+ combineAndStoreElement(loc, builder, lhsArrayElement, rhsArrayElement,
+ temporaryLHS, combiner);
+}
+
+void hlfir::genNoAliasAssignment(
+ mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity rhs,
+ hlfir::Entity lhs, bool emitWorkshareLoop, bool temporaryLHS,
+ std::function<hlfir::Entity(mlir::Location, fir::FirOpBuilder &,
+ hlfir::Entity, hlfir::Entity)> *combiner) {
+ if (lhs.isArray()) {
+ genNoAliasArrayAssignment(loc, builder, rhs, lhs, emitWorkshareLoop,
+ temporaryLHS, combiner);
+ return;
+ }
+ rhs = hlfir::derefPointersAndAllocatables(loc, builder, rhs);
+ lhs = hlfir::derefPointersAndAllocatables(loc, builder, lhs);
+ combineAndStoreElement(loc, builder, lhs, rhs, temporaryLHS, combiner);
+}
+
std::pair<hlfir::Entity, bool>
hlfir::createTempFromMold(mlir::Location loc, fir::FirOpBuilder &builder,
hlfir::Entity mold) {
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index ca3e1cd..75a74ee 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -16,6 +16,7 @@
#include "flang/Optimizer/Builder/IntrinsicCall.h"
#include "flang/Common/static-multimap-view.h"
#include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/Character.h"
#include "flang/Optimizer/Builder/Complex.h"
@@ -50,7 +51,6 @@
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/Dialect/Math/IR/Math.h"
-#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -91,6 +91,11 @@ static bool isStaticallyAbsent(llvm::ArrayRef<mlir::Value> args,
size_t argIndex) {
return args.size() <= argIndex || !args[argIndex];
}
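+/// Test if a value is a Fortran OPTIONAL entity (its presence or absence may
+/// only be known at runtime).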
+static bool isOptional(mlir::Value value) {
+ auto varIface = mlir::dyn_cast_or_null<fir::FortranVariableOpInterface>(
+ value.getDefiningOp());
+ return varIface && varIface.isOptional();
+}
/// Test if an ExtendedValue is present. This is used to test if an intrinsic
/// argument is present at compile time. This does not imply that the related
@@ -108,34 +113,6 @@ using I = IntrinsicLibrary;
/// argument is an optional variable in the current scope).
static constexpr bool handleDynamicOptional = true;
-/// TODO: Move all CUDA Fortran intrinsic handlers into its own file similar to
-/// PPC.
-static const char __ldca_i4x4[] = "__ldca_i4x4_";
-static const char __ldca_i8x2[] = "__ldca_i8x2_";
-static const char __ldca_r2x2[] = "__ldca_r2x2_";
-static const char __ldca_r4x4[] = "__ldca_r4x4_";
-static const char __ldca_r8x2[] = "__ldca_r8x2_";
-static const char __ldcg_i4x4[] = "__ldcg_i4x4_";
-static const char __ldcg_i8x2[] = "__ldcg_i8x2_";
-static const char __ldcg_r2x2[] = "__ldcg_r2x2_";
-static const char __ldcg_r4x4[] = "__ldcg_r4x4_";
-static const char __ldcg_r8x2[] = "__ldcg_r8x2_";
-static const char __ldcs_i4x4[] = "__ldcs_i4x4_";
-static const char __ldcs_i8x2[] = "__ldcs_i8x2_";
-static const char __ldcs_r2x2[] = "__ldcs_r2x2_";
-static const char __ldcs_r4x4[] = "__ldcs_r4x4_";
-static const char __ldcs_r8x2[] = "__ldcs_r8x2_";
-static const char __ldcv_i4x4[] = "__ldcv_i4x4_";
-static const char __ldcv_i8x2[] = "__ldcv_i8x2_";
-static const char __ldcv_r2x2[] = "__ldcv_r2x2_";
-static const char __ldcv_r4x4[] = "__ldcv_r4x4_";
-static const char __ldcv_r8x2[] = "__ldcv_r8x2_";
-static const char __ldlu_i4x4[] = "__ldlu_i4x4_";
-static const char __ldlu_i8x2[] = "__ldlu_i8x2_";
-static const char __ldlu_r2x2[] = "__ldlu_r2x2_";
-static const char __ldlu_r4x4[] = "__ldlu_r4x4_";
-static const char __ldlu_r8x2[] = "__ldlu_r8x2_";
-
/// Table that drives the fir generation depending on the intrinsic or intrinsic
/// module procedure one to one mapping with Fortran arguments. If no mapping is
/// defined here for a generic intrinsic, genRuntimeCall will be called
@@ -144,106 +121,6 @@ static const char __ldlu_r8x2[] = "__ldlu_r8x2_";
/// argument must not be lowered by value. In which case, the lowering rules
/// should be provided for all the intrinsic arguments for completeness.
static constexpr IntrinsicHandler handlers[]{
- {"__ldca_i4x4",
- &I::genCUDALDXXFunc<__ldca_i4x4, 4>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldca_i8x2",
- &I::genCUDALDXXFunc<__ldca_i8x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldca_r2x2",
- &I::genCUDALDXXFunc<__ldca_r2x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldca_r4x4",
- &I::genCUDALDXXFunc<__ldca_r4x4, 4>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldca_r8x2",
- &I::genCUDALDXXFunc<__ldca_r8x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcg_i4x4",
- &I::genCUDALDXXFunc<__ldcg_i4x4, 4>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcg_i8x2",
- &I::genCUDALDXXFunc<__ldcg_i8x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcg_r2x2",
- &I::genCUDALDXXFunc<__ldcg_r2x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcg_r4x4",
- &I::genCUDALDXXFunc<__ldcg_r4x4, 4>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcg_r8x2",
- &I::genCUDALDXXFunc<__ldcg_r8x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcs_i4x4",
- &I::genCUDALDXXFunc<__ldcs_i4x4, 4>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcs_i8x2",
- &I::genCUDALDXXFunc<__ldcs_i8x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcs_r2x2",
- &I::genCUDALDXXFunc<__ldcs_r2x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcs_r4x4",
- &I::genCUDALDXXFunc<__ldcs_r4x4, 4>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcs_r8x2",
- &I::genCUDALDXXFunc<__ldcs_r8x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcv_i4x4",
- &I::genCUDALDXXFunc<__ldcv_i4x4, 4>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcv_i8x2",
- &I::genCUDALDXXFunc<__ldcv_i8x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcv_r2x2",
- &I::genCUDALDXXFunc<__ldcv_r2x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcv_r4x4",
- &I::genCUDALDXXFunc<__ldcv_r4x4, 4>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldcv_r8x2",
- &I::genCUDALDXXFunc<__ldcv_r8x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldlu_i4x4",
- &I::genCUDALDXXFunc<__ldlu_i4x4, 4>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldlu_i8x2",
- &I::genCUDALDXXFunc<__ldlu_i8x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldlu_r2x2",
- &I::genCUDALDXXFunc<__ldlu_r2x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldlu_r4x4",
- &I::genCUDALDXXFunc<__ldlu_r4x4, 4>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
- {"__ldlu_r8x2",
- &I::genCUDALDXXFunc<__ldlu_r8x2, 2>,
- {{{"a", asAddr}}},
- /*isElemental=*/false},
{"abort", &I::genAbort},
{"abs", &I::genAbs},
{"achar", &I::genChar},
@@ -263,10 +140,6 @@ static constexpr IntrinsicHandler handlers[]{
&I::genAll,
{{{"mask", asAddr}, {"dim", asValue}}},
/*isElemental=*/false},
- {"all_sync",
- &I::genVoteSync<mlir::NVVM::VoteSyncKind::all>,
- {{{"mask", asValue}, {"pred", asValue}}},
- /*isElemental=*/false},
{"allocated",
&I::genAllocated,
{{{"array", asInquired}, {"scalar", asInquired}}},
@@ -276,10 +149,6 @@ static constexpr IntrinsicHandler handlers[]{
&I::genAny,
{{{"mask", asAddr}, {"dim", asValue}}},
/*isElemental=*/false},
- {"any_sync",
- &I::genVoteSync<mlir::NVVM::VoteSyncKind::any>,
- {{{"mask", asValue}, {"pred", asValue}}},
- /*isElemental=*/false},
{"asind", &I::genAsind},
{"asinpi", &I::genAsinpi},
{"associated",
@@ -290,83 +159,6 @@ static constexpr IntrinsicHandler handlers[]{
{"atan2pi", &I::genAtanpi},
{"atand", &I::genAtand},
{"atanpi", &I::genAtanpi},
- {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomiccasd",
- &I::genAtomicCas,
- {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
- false},
- {"atomiccasf",
- &I::genAtomicCas,
- {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
- false},
- {"atomiccasi",
- &I::genAtomicCas,
- {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
- false},
- {"atomiccasul",
- &I::genAtomicCas,
- {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
- false},
- {"atomicdeci", &I::genAtomicDec, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicexchd",
- &I::genAtomicExch,
- {{{"a", asAddr}, {"v", asValue}}},
- false},
- {"atomicexchf",
- &I::genAtomicExch,
- {{{"a", asAddr}, {"v", asValue}}},
- false},
- {"atomicexchi",
- &I::genAtomicExch,
- {{{"a", asAddr}, {"v", asValue}}},
- false},
- {"atomicexchul",
- &I::genAtomicExch,
- {{{"a", asAddr}, {"v", asValue}}},
- false},
- {"atomicinci", &I::genAtomicInc, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicmaxd", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicmaxf", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicmaxi", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicmaxl", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicmind", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicminf", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicmini", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicminl", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicori", &I::genAtomicOr, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicsubd", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicsubf", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicsubi", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicsubl", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"atomicxori", &I::genAtomicXor, {{{"a", asAddr}, {"v", asValue}}}, false},
- {"ballot_sync",
- &I::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>,
- {{{"mask", asValue}, {"pred", asValue}}},
- /*isElemental=*/false},
- {"barrier_arrive",
- &I::genBarrierArrive,
- {{{"barrier", asAddr}}},
- /*isElemental=*/false},
- {"barrier_arrive_cnt",
- &I::genBarrierArriveCnt,
- {{{"barrier", asAddr}, {"count", asValue}}},
- /*isElemental=*/false},
- {"barrier_init",
- &I::genBarrierInit,
- {{{"barrier", asAddr}, {"count", asValue}}},
- /*isElemental=*/false},
- {"barrier_try_wait",
- &I::genBarrierTryWait,
- {{{"barrier", asAddr}, {"token", asValue}}},
- /*isElemental=*/false},
- {"barrier_try_wait_sleep",
- &I::genBarrierTryWaitSleep,
- {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}},
- /*isElemental=*/false},
{"bessel_jn",
&I::genBesselJn,
{{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}},
@@ -410,11 +202,6 @@ static constexpr IntrinsicHandler handlers[]{
&I::genChdir,
{{{"name", asAddr}, {"status", asAddr, handleDynamicOptional}}},
/*isElemental=*/false},
- {"clock", &I::genNVVMTime<mlir::NVVM::ClockOp>, {}, /*isElemental=*/false},
- {"clock64",
- &I::genNVVMTime<mlir::NVVM::Clock64Op>,
- {},
- /*isElemental=*/false},
{"cmplx",
&I::genCmplx,
{{{"x", asValue}, {"y", asValue, handleDynamicOptional}}}},
@@ -511,10 +298,6 @@ static constexpr IntrinsicHandler handlers[]{
&I::genExtendsTypeOf,
{{{"a", asBox}, {"mold", asBox}}},
/*isElemental=*/false},
- {"fence_proxy_async",
- &I::genFenceProxyAsync,
- {},
- /*isElemental=*/false},
{"findloc",
&I::genFindloc,
{{{"array", asBox},
@@ -525,6 +308,10 @@ static constexpr IntrinsicHandler handlers[]{
{"back", asValue, handleDynamicOptional}}},
/*isElemental=*/false},
{"floor", &I::genFloor},
+ {"flush",
+ &I::genFlush,
+ {{{"unit", asAddr}}},
+ /*isElemental=*/false},
{"fraction", &I::genFraction},
{"free", &I::genFree},
{"fseek",
@@ -562,6 +349,10 @@ static constexpr IntrinsicHandler handlers[]{
{"trim_name", asAddr, handleDynamicOptional},
{"errmsg", asBox, handleDynamicOptional}}},
/*isElemental=*/false},
+ {"get_team",
+ &I::genGetTeam,
+ {{{"level", asValue, handleDynamicOptional}}},
+ /*isElemental=*/false},
{"getcwd",
&I::genGetCwd,
{{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}},
@@ -569,10 +360,6 @@ static constexpr IntrinsicHandler handlers[]{
{"getgid", &I::genGetGID},
{"getpid", &I::genGetPID},
{"getuid", &I::genGetUID},
- {"globaltimer",
- &I::genNVVMTime<mlir::NVVM::GlobalTimerOp>,
- {},
- /*isElemental=*/false},
{"hostnm",
&I::genHostnm,
{{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}},
@@ -712,6 +499,10 @@ static constexpr IntrinsicHandler handlers[]{
{"dim", asValue},
{"mask", asBox, handleDynamicOptional}}},
/*isElemental=*/false},
+ {"irand",
+ &I::genIrand,
+ {{{"i", asAddr, handleDynamicOptional}}},
+ /*isElemental=*/false},
{"is_contiguous",
&I::genIsContiguous,
{{{"array", asBox}}},
@@ -740,38 +531,6 @@ static constexpr IntrinsicHandler handlers[]{
{"malloc", &I::genMalloc},
{"maskl", &I::genMask<mlir::arith::ShLIOp>},
{"maskr", &I::genMask<mlir::arith::ShRUIOp>},
- {"match_all_syncjd",
- &I::genMatchAllSync,
- {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
- /*isElemental=*/false},
- {"match_all_syncjf",
- &I::genMatchAllSync,
- {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
- /*isElemental=*/false},
- {"match_all_syncjj",
- &I::genMatchAllSync,
- {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
- /*isElemental=*/false},
- {"match_all_syncjx",
- &I::genMatchAllSync,
- {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
- /*isElemental=*/false},
- {"match_any_syncjd",
- &I::genMatchAnySync,
- {{{"mask", asValue}, {"value", asValue}}},
- /*isElemental=*/false},
- {"match_any_syncjf",
- &I::genMatchAnySync,
- {{{"mask", asValue}, {"value", asValue}}},
- /*isElemental=*/false},
- {"match_any_syncjj",
- &I::genMatchAnySync,
- {{{"mask", asValue}, {"value", asValue}}},
- /*isElemental=*/false},
- {"match_any_syncjx",
- &I::genMatchAnySync,
- {{{"mask", asValue}, {"value", asValue}}},
- /*isElemental=*/false},
{"matmul",
&I::genMatmul,
{{{"matrix_a", asAddr}, {"matrix_b", asAddr}}},
@@ -870,6 +629,10 @@ static constexpr IntrinsicHandler handlers[]{
&I::genPutenv,
{{{"str", asAddr}, {"status", asAddr, handleDynamicOptional}}},
/*isElemental=*/false},
+ {"rand",
+ &I::genRand,
+ {{{"i", asAddr, handleDynamicOptional}}},
+ /*isElemental=*/false},
{"random_init",
&I::genRandomInit,
{{{"repeatable", asValue}, {"image_distinct", asValue}}},
@@ -964,6 +727,10 @@ static constexpr IntrinsicHandler handlers[]{
{"shifta", &I::genShiftA},
{"shiftl", &I::genShift<mlir::arith::ShLIOp>},
{"shiftr", &I::genShift<mlir::arith::ShRUIOp>},
+ {"show_descriptor",
+ &I::genShowDescriptor,
+ {{{"d", asBox}}},
+ /*isElemental=*/false},
{"sign", &I::genSign},
{"signal",
&I::genSignalSubroutine,
@@ -997,20 +764,6 @@ static constexpr IntrinsicHandler handlers[]{
{"dim", asValue},
{"mask", asBox, handleDynamicOptional}}},
/*isElemental=*/false},
- {"syncthreads", &I::genSyncThreads, {}, /*isElemental=*/false},
- {"syncthreads_and_i4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false},
- {"syncthreads_and_l4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false},
- {"syncthreads_count_i4",
- &I::genSyncThreadsCount,
- {},
- /*isElemental=*/false},
- {"syncthreads_count_l4",
- &I::genSyncThreadsCount,
- {},
- /*isElemental=*/false},
- {"syncthreads_or_i4", &I::genSyncThreadsOr, {}, /*isElemental=*/false},
- {"syncthreads_or_l4", &I::genSyncThreadsOr, {}, /*isElemental=*/false},
- {"syncwarp", &I::genSyncWarp, {}, /*isElemental=*/false},
{"system",
&I::genSystem,
{{{"command", asBox}, {"exitstat", asBox, handleDynamicOptional}}},
@@ -1021,115 +774,17 @@ static constexpr IntrinsicHandler handlers[]{
/*isElemental=*/false},
{"tand", &I::genTand},
{"tanpi", &I::genTanpi},
- {"this_grid", &I::genThisGrid, {}, /*isElemental=*/false},
+ {"team_number",
+ &I::genTeamNumber,
+ {{{"team", asBox, handleDynamicOptional}}},
+ /*isElemental=*/false},
{"this_image",
&I::genThisImage,
{{{"coarray", asBox},
{"dim", asAddr},
{"team", asBox, handleDynamicOptional}}},
/*isElemental=*/false},
- {"this_thread_block", &I::genThisThreadBlock, {}, /*isElemental=*/false},
- {"this_warp", &I::genThisWarp, {}, /*isElemental=*/false},
- {"threadfence", &I::genThreadFence, {}, /*isElemental=*/false},
- {"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false},
- {"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false},
{"time", &I::genTime, {}, /*isElemental=*/false},
- {"tma_bulk_commit_group",
- &I::genTMABulkCommitGroup,
- {{}},
- /*isElemental=*/false},
- {"tma_bulk_g2s",
- &I::genTMABulkG2S,
- {{{"barrier", asAddr},
- {"src", asAddr},
- {"dst", asAddr},
- {"nbytes", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_ldc4",
- &I::genTMABulkLoadC4,
- {{{"barrier", asAddr},
- {"src", asAddr},
- {"dst", asAddr},
- {"nelems", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_ldc8",
- &I::genTMABulkLoadC8,
- {{{"barrier", asAddr},
- {"src", asAddr},
- {"dst", asAddr},
- {"nelems", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_ldi4",
- &I::genTMABulkLoadI4,
- {{{"barrier", asAddr},
- {"src", asAddr},
- {"dst", asAddr},
- {"nelems", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_ldi8",
- &I::genTMABulkLoadI8,
- {{{"barrier", asAddr},
- {"src", asAddr},
- {"dst", asAddr},
- {"nelems", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_ldr2",
- &I::genTMABulkLoadR2,
- {{{"barrier", asAddr},
- {"src", asAddr},
- {"dst", asAddr},
- {"nelems", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_ldr4",
- &I::genTMABulkLoadR4,
- {{{"barrier", asAddr},
- {"src", asAddr},
- {"dst", asAddr},
- {"nelems", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_ldr8",
- &I::genTMABulkLoadR8,
- {{{"barrier", asAddr},
- {"src", asAddr},
- {"dst", asAddr},
- {"nelems", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_s2g",
- &I::genTMABulkS2G,
- {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_store_c4",
- &I::genTMABulkStoreC4,
- {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_store_c8",
- &I::genTMABulkStoreC8,
- {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_store_i4",
- &I::genTMABulkStoreI4,
- {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_store_i8",
- &I::genTMABulkStoreI8,
- {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_store_r2",
- &I::genTMABulkStoreR2,
- {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_store_r4",
- &I::genTMABulkStoreR4,
- {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_store_r8",
- &I::genTMABulkStoreR8,
- {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
- /*isElemental=*/false},
- {"tma_bulk_wait_group",
- &I::genTMABulkWaitGroup,
- {{}},
- /*isElemental=*/false},
{"trailz", &I::genTrailz},
{"transfer",
&I::genTransfer,
@@ -2221,6 +1876,9 @@ lookupIntrinsicHandler(fir::FirOpBuilder &builder,
if (isPPCTarget)
if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name))
return std::make_optional<IntrinsicHandlerEntry>(ppcHandler);
+ // TODO: Look for CUDA intrinsic handlers only if CUDA is enabled.
+ if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name))
+ return std::make_optional<IntrinsicHandlerEntry>(cudaHandler);
// Subroutines should have a handler.
if (!resultType)
return std::nullopt;
@@ -3107,159 +2765,6 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType,
return mlir::arith::MulFOp::create(builder, loc, atan, factor);
}
-static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc,
- mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0,
- mlir::Value arg1) {
- auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext());
- arg0 = builder.createConvert(loc, llvmPointerType, arg0);
- return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1,
- mlir::LLVM::AtomicOrdering::seq_cst);
-}
-
-mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 2);
-
- mlir::LLVM::AtomicBinOp binOp =
- mlir::isa<mlir::IntegerType>(args[1].getType())
- ? mlir::LLVM::AtomicBinOp::add
- : mlir::LLVM::AtomicBinOp::fadd;
- return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
-}
-
-mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 2);
-
- mlir::LLVM::AtomicBinOp binOp =
- mlir::isa<mlir::IntegerType>(args[1].getType())
- ? mlir::LLVM::AtomicBinOp::sub
- : mlir::LLVM::AtomicBinOp::fsub;
- return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
-}
-
-mlir::Value IntrinsicLibrary::genAtomicAnd(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 2);
- assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
-
- mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and;
- return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
-}
-
-mlir::Value IntrinsicLibrary::genAtomicOr(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 2);
- assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
-
- mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or;
- return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
-}
-
-// ATOMICCAS
-fir::ExtendedValue
-IntrinsicLibrary::genAtomicCas(mlir::Type resultType,
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 3);
- auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel;
- auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic;
- auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext());
-
- mlir::Value arg0 = fir::getBase(args[0]);
- mlir::Value arg1 = fir::getBase(args[1]);
- mlir::Value arg2 = fir::getBase(args[2]);
-
- auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value {
- if (mlir::isa<mlir::Float32Type>(arg.getType()))
- return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(),
- arg);
- if (mlir::isa<mlir::Float64Type>(arg.getType()))
- return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(),
- arg);
- return arg;
- };
-
- arg1 = bitCastFloat(arg1);
- arg2 = bitCastFloat(arg2);
-
- if (arg1.getType() != arg2.getType()) {
- // arg1 and arg2 need to have the same type in AtomicCmpXchgOp.
- arg2 = builder.createConvert(loc, arg1.getType(), arg2);
- }
-
- auto address =
- mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0)
- .getResult(0);
- auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create(
- builder, loc, address, arg1, arg2, successOrdering, failureOrdering);
- mlir::Value boolResult =
- mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1);
- return builder.createConvert(loc, resultType, boolResult);
-}
-
-mlir::Value IntrinsicLibrary::genAtomicDec(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 2);
- assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
-
- mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap;
- return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
-}
-
-// ATOMICEXCH
-fir::ExtendedValue
-IntrinsicLibrary::genAtomicExch(mlir::Type resultType,
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 2);
- mlir::Value arg0 = fir::getBase(args[0]);
- mlir::Value arg1 = fir::getBase(args[1]);
- assert(arg1.getType().isIntOrFloat());
-
- mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg;
- return genAtomBinOp(builder, loc, binOp, arg0, arg1);
-}
-
-mlir::Value IntrinsicLibrary::genAtomicInc(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 2);
- assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
-
- mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap;
- return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
-}
-
-mlir::Value IntrinsicLibrary::genAtomicMax(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 2);
-
- mlir::LLVM::AtomicBinOp binOp =
- mlir::isa<mlir::IntegerType>(args[1].getType())
- ? mlir::LLVM::AtomicBinOp::max
- : mlir::LLVM::AtomicBinOp::fmax;
- return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
-}
-
-mlir::Value IntrinsicLibrary::genAtomicMin(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 2);
-
- mlir::LLVM::AtomicBinOp binOp =
- mlir::isa<mlir::IntegerType>(args[1].getType())
- ? mlir::LLVM::AtomicBinOp::min
- : mlir::LLVM::AtomicBinOp::fmin;
- return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
-}
-
-// ATOMICXOR
-fir::ExtendedValue
-IntrinsicLibrary::genAtomicXor(mlir::Type resultType,
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 2);
- mlir::Value arg0 = fir::getBase(args[0]);
- mlir::Value arg1 = fir::getBase(args[1]);
- return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1);
-}
-
// ASSOCIATED
fir::ExtendedValue
IntrinsicLibrary::genAssociated(mlir::Type resultType,
@@ -3311,114 +2816,6 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType,
return fir::runtime::genAssociated(builder, loc, pointerBox, targetBox);
}
-static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder,
- mlir::Location loc,
- mlir::Value barrier,
- mlir::NVVM::NVVMMemorySpace space) {
- mlir::Value llvmPtr = fir::ConvertOp::create(
- builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()),
- barrier);
- mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create(
- builder, loc,
- mlir::LLVM::LLVMPointerType::get(builder.getContext(),
- static_cast<unsigned>(space)),
- llvmPtr);
- return addrCast;
-}
-
-// BARRIER_ARRIVE (CUDA)
-mlir::Value
-IntrinsicLibrary::genBarrierArrive(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 1);
- mlir::Value barrier = convertPtrToNVVMSpace(
- builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared);
- return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType,
- barrier)
- .getResult();
-}
-
-// BARRIER_ARRIBVE_CNT (CUDA)
-mlir::Value
-IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 2);
- mlir::Value barrier = convertPtrToNVVMSpace(
- builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared);
- mlir::Value token = fir::AllocaOp::create(builder, loc, resultType);
- // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and
- // currently just the sink symbol `_`.
- // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive
- mlir::NVVM::MBarrierArriveExpectTxOp::create(builder, loc, barrier, args[1],
- {});
- return fir::LoadOp::create(builder, loc, token);
-}
-
-// BARRIER_INIT (CUDA)
-void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 2);
- mlir::Value barrier = convertPtrToNVVMSpace(
- builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared);
- mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, barrier,
- fir::getBase(args[1]), {});
- auto kind = mlir::NVVM::ProxyKindAttr::get(
- builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
- auto space = mlir::NVVM::SharedSpaceAttr::get(
- builder.getContext(), mlir::NVVM::SharedSpace::shared_cta);
- mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
-}
-
-// BARRIER_TRY_WAIT (CUDA)
-mlir::Value
-IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 2);
- mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
- mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
- fir::StoreOp::create(builder, loc, zero, res);
- mlir::Value ns =
- builder.createIntegerConstant(loc, builder.getI32Type(), 1000000);
- mlir::Value load = fir::LoadOp::create(builder, loc, res);
- auto whileOp = mlir::scf::WhileOp::create(
- builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load});
- mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore());
- mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc);
- builder.setInsertionPointToStart(beforeBlock);
- mlir::Value condition = mlir::arith::CmpIOp::create(
- builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero);
- mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg);
- mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter());
- afterBlock->addArgument(resultType, loc);
- builder.setInsertionPointToStart(afterBlock);
- auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
- auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]);
- mlir::Value ret =
- mlir::NVVM::InlinePtxOp::create(
- builder, loc, {resultType}, {barrier, args[1], ns}, {},
- ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; "
- "selp.b32 %0, 1, 0, p;",
- {})
- .getResult(0);
- mlir::scf::YieldOp::create(builder, loc, ret);
- builder.setInsertionPointAfter(whileOp);
- return whileOp.getResult(0);
-}
-
-// BARRIER_TRY_WAIT_SLEEP (CUDA)
-mlir::Value
-IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 3);
- auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
- auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]);
- return mlir::NVVM::InlinePtxOp::create(
- builder, loc, {resultType}, {barrier, args[1], args[2]}, {},
- ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; "
- "selp.b32 %0, 1, 0, p;",
- {})
- .getResult(0);
-}
-
// BESSEL_JN
fir::ExtendedValue
IntrinsicLibrary::genBesselJn(mlir::Type resultType,
@@ -4152,30 +3549,6 @@ IntrinsicLibrary::genCshift(mlir::Type resultType,
return readAndAddCleanUp(resultMutableBox, resultType, "CSHIFT");
}
-// __LDCA, __LDCS, __LDLU, __LDCV
-template <const char *fctName, int extent>
-fir::ExtendedValue
-IntrinsicLibrary::genCUDALDXXFunc(mlir::Type resultType,
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 1);
- mlir::Type resTy = fir::SequenceType::get(extent, resultType);
- mlir::Value arg = fir::getBase(args[0]);
- mlir::Value res = fir::AllocaOp::create(builder, loc, resTy);
- if (mlir::isa<fir::BaseBoxType>(arg.getType()))
- arg = fir::BoxAddrOp::create(builder, loc, arg);
- mlir::Type refResTy = fir::ReferenceType::get(resTy);
- mlir::FunctionType ftype =
- mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {});
- auto funcOp = builder.createFunction(loc, fctName, ftype);
- llvm::SmallVector<mlir::Value> funcArgs;
- funcArgs.push_back(res);
- funcArgs.push_back(arg);
- fir::CallOp::create(builder, loc, funcOp, funcArgs);
- mlir::Value ext =
- builder.createIntegerConstant(loc, builder.getIndexType(), extent);
- return fir::ArrayBoxValue(res, {ext});
-}
-
// DATE_AND_TIME
void IntrinsicLibrary::genDateAndTime(llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4 && "date_and_time has 4 args");
@@ -4508,17 +3881,6 @@ IntrinsicLibrary::genExtendsTypeOf(mlir::Type resultType,
fir::getBase(args[1])));
}
-// FENCE_PROXY_ASYNC (CUDA)
-void IntrinsicLibrary::genFenceProxyAsync(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 0);
- auto kind = mlir::NVVM::ProxyKindAttr::get(
- builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
- auto space = mlir::NVVM::SharedSpaceAttr::get(
- builder.getContext(), mlir::NVVM::SharedSpace::shared_cta);
- mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
-}
-
// FINDLOC
fir::ExtendedValue
IntrinsicLibrary::genFindloc(mlir::Type resultType,
@@ -4601,6 +3963,40 @@ mlir::Value IntrinsicLibrary::genFloor(mlir::Type resultType,
return builder.createConvert(loc, resultType, floor);
}
+// FLUSH
+void IntrinsicLibrary::genFlush(llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 1);
+
+ mlir::Value unit;
+ if (isStaticallyAbsent(args[0]))
+    // Use the sentinel value -1 when the UNIT argument is statically absent.
+ unit = builder.createIntegerConstant(loc, builder.getI32Type(), -1);
+ else {
+ unit = fir::getBase(args[0]);
+ if (isOptional(unit)) {
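+      // The UNIT argument may be dynamically absent: load it when present,
+      // otherwise fall back to the -1 sentinel.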
+ mlir::Value isPresent =
+ fir::IsPresentOp::create(builder, loc, builder.getI1Type(), unit);
+ unit = builder
+ .genIfOp(loc, builder.getI32Type(), isPresent,
+ /*withElseRegion=*/true)
+ .genThen([&]() {
+ mlir::Value loaded = fir::LoadOp::create(builder, loc, unit);
+ fir::ResultOp::create(builder, loc, loaded);
+ })
+ .genElse([&]() {
+ mlir::Value negOne = builder.createIntegerConstant(
+ loc, builder.getI32Type(), -1);
+ fir::ResultOp::create(builder, loc, negOne);
+ })
+ .getResults()[0];
+ } else {
+ unit = fir::LoadOp::create(builder, loc, unit);
+ }
+ }
+
+ fir::runtime::genFlush(builder, loc, unit);
+}
+
// FRACTION
mlir::Value IntrinsicLibrary::genFraction(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
@@ -4680,6 +4076,15 @@ IntrinsicLibrary::genFtell(std::optional<mlir::Type> resultType,
}
}
+// GET_TEAM
+mlir::Value IntrinsicLibrary::genGetTeam(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ converter->checkCoarrayEnabled();
+ assert(args.size() == 1);
+  return mif::GetTeamOp::create(builder, loc, fir::BoxType::get(resultType),
+                                /*level=*/args[0]);
+}
+
// GETCWD
fir::ExtendedValue
IntrinsicLibrary::genGetCwd(std::optional<mlir::Type> resultType,
@@ -6765,6 +6170,20 @@ IntrinsicLibrary::genIparity(mlir::Type resultType,
"IPARITY", resultType, args);
}
+// IRAND
+fir::ExtendedValue
+IntrinsicLibrary::genIrand(mlir::Type resultType,
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 1);
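+  // Pass an absent reference to the runtime when the optional argument is
+  // missing.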
+ mlir::Value i =
+ isStaticallyPresent(args[0])
+ ? fir::getBase(args[0])
+ : fir::AbsentOp::create(builder, loc,
+ builder.getRefType(builder.getI32Type()))
+ .getResult();
+ return fir::runtime::genIrand(builder, loc, i);
+}
+
// IS_CONTIGUOUS
fir::ExtendedValue
IntrinsicLibrary::genIsContiguous(mlir::Type resultType,
@@ -6948,12 +6367,6 @@ IntrinsicLibrary::genCharacterCompare(mlir::Type resultType,
fir::getBase(args[1]), fir::getLen(args[1]));
}
-static bool isOptional(mlir::Value value) {
- auto varIface = mlir::dyn_cast_or_null<fir::FortranVariableOpInterface>(
- value.getDefiningOp());
- return varIface && varIface.isOptional();
-}
-
// LOC
fir::ExtendedValue
IntrinsicLibrary::genLoc(mlir::Type resultType,
@@ -7029,67 +6442,6 @@ mlir::Value IntrinsicLibrary::genMask(mlir::Type resultType,
return result;
}
-// MATCH_ALL_SYNC
-mlir::Value
-IntrinsicLibrary::genMatchAllSync(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 3);
- bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32();
-
- mlir::Type i1Ty = builder.getI1Type();
- mlir::MLIRContext *context = builder.getContext();
-
- mlir::Value arg1 = args[1];
- if (arg1.getType().isF32() || arg1.getType().isF64())
- arg1 = fir::ConvertOp::create(
- builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1);
-
- mlir::Type retTy =
- mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty});
- auto match =
- mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1,
- mlir::NVVM::MatchSyncKind::all)
- .getResult();
- auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0);
- auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1);
- auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred);
- fir::StoreOp::create(builder, loc, conv, args[2]);
- return value;
-}
-
-// ALL_SYNC, ANY_SYNC, BALLOT_SYNC
-template <mlir::NVVM::VoteSyncKind kind>
-mlir::Value IntrinsicLibrary::genVoteSync(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 2);
- mlir::Value arg1 =
- fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]);
- mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot
- ? builder.getI32Type()
- : builder.getI1Type();
- auto voteRes =
- mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind)
- .getResult();
- return fir::ConvertOp::create(builder, loc, resultType, voteRes);
-}
-
-// MATCH_ANY_SYNC
-mlir::Value
-IntrinsicLibrary::genMatchAnySync(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 2);
- bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32();
-
- mlir::Value arg1 = args[1];
- if (arg1.getType().isF32() || arg1.getType().isF64())
- arg1 = fir::ConvertOp::create(
- builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1);
-
- return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0],
- arg1, mlir::NVVM::MatchSyncKind::any)
- .getResult();
-}
-
// MATMUL
fir::ExtendedValue
IntrinsicLibrary::genMatmul(mlir::Type resultType,
@@ -7237,11 +6589,9 @@ static mlir::Value genFastMod(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
auto mod = builder.getModule();
- bool dontUseFastRealMod = false;
- bool canUseApprox = mlir::arith::bitEnumContainsAny(
- builder.getFastMathFlags(), mlir::arith::FastMathFlags::afn);
- if (auto attr = mod->getAttrOfType<mlir::BoolAttr>("fir.no_fast_real_mod"))
- dontUseFastRealMod = attr.getValue();
+ bool useFastRealMod = false;
+ if (auto attr = mod->getAttrOfType<mlir::BoolAttr>("fir.fast_real_mod"))
+ useFastRealMod = attr.getValue();
assert(args.size() == 2);
if (resultType.isUnsignedInteger()) {
@@ -7254,7 +6604,7 @@ mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
if (mlir::isa<mlir::IntegerType>(resultType))
return mlir::arith::RemSIOp::create(builder, loc, args[0], args[1]);
- if (resultType.isFloat() && canUseApprox && !dontUseFastRealMod) {
+ if (resultType.isFloat() && useFastRealMod) {
    // Treat MOD as an approximate function and generate inline code
// instead of calling into the Fortran runtime library.
return builder.createConvert(loc, resultType,
@@ -7707,14 +7057,6 @@ IntrinsicLibrary::genNumImages(mlir::Type resultType,
return mif::NumImagesOp::create(builder, loc).getResult();
}
-// CLOCK, CLOCK64, GLOBALTIMER
-template <typename OpTy>
-mlir::Value IntrinsicLibrary::genNVVMTime(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 0 && "expect no arguments");
- return OpTy::create(builder, loc, resultType).getResult();
-}
-
// PACK
fir::ExtendedValue
IntrinsicLibrary::genPack(mlir::Type resultType,
@@ -7868,6 +7210,19 @@ IntrinsicLibrary::genPutenv(std::optional<mlir::Type> resultType,
return {};
}
+// RAND
+fir::ExtendedValue
+IntrinsicLibrary::genRand(mlir::Type, llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 1);
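+  // The argument is optional; materialize an absent reference when it is not
+  // supplied so the runtime can tell the difference.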
+ mlir::Value i =
+ isStaticallyPresent(args[0])
+ ? fir::getBase(args[0])
+ : fir::AbsentOp::create(builder, loc,
+ builder.getRefType(builder.getI32Type()))
+ .getResult();
+ return fir::runtime::genRand(builder, loc, i);
+}
+
// RANDOM_INIT
void IntrinsicLibrary::genRandomInit(llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 2);
@@ -8533,6 +7888,16 @@ mlir::Value IntrinsicLibrary::genShiftA(mlir::Type resultType,
return result;
}
+void IntrinsicLibrary::genShowDescriptor(
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 1 && "expected single argument for show_descriptor");
+ const mlir::Value descriptor = fir::getBase(args[0]);
+
+ assert(fir::isa_box_type(descriptor.getType()) &&
+ "argument must have been lowered to box type");
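+  // Hand the boxed argument to the runtime entry point, which dumps the
+  // descriptor contents.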
+ fir::runtime::genShowDescriptor(builder, loc, descriptor);
+}
+
// SIGNAL
void IntrinsicLibrary::genSignalSubroutine(
llvm::ArrayRef<fir::ExtendedValue> args) {
@@ -8689,90 +8054,14 @@ mlir::Value IntrinsicLibrary::genTanpi(mlir::Type resultType,
return getRuntimeCallGenerator("tan", ftype)(builder, loc, {arg});
}
-// THIS_GRID
-mlir::Value IntrinsicLibrary::genThisGrid(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 0);
- auto recTy = mlir::cast<fir::RecordType>(resultType);
-  assert(recTy && "RecordType expected");
- mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
- mlir::Type i32Ty = builder.getI32Type();
-
- mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty);
- mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty);
- mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty);
-
- mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty);
- mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty);
- mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty);
-
- mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty);
- mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty);
- mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty);
- mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty);
- mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty);
- mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty);
-
- // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) *
- // (blockDim.x * gridDim.x);
- mlir::Value resZ =
- mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ);
- mlir::Value resY =
- mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY);
- mlir::Value resX =
- mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX);
- mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY);
- mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX);
-
- // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) +
- // blockIdx.x;
-  // this_grid.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) +
- // ((threadIdx.z * blockDim.y) * blockDim.x) +
- // (threadIdx.y * blockDim.x) + threadIdx.x + 1;
- mlir::Value r1 =
- mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY);
- mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX);
- mlir::Value r3 =
- mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX);
- mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3);
- mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX);
-
- mlir::Value bXbY =
- mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY);
- mlir::Value bXbYbZ =
- mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ);
- mlir::Value tZbY =
- mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY);
- mlir::Value tZbYbX =
- mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX);
- mlir::Value tYbX =
- mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX);
- mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ);
- rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX);
- rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX);
- rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX);
- mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
- rank = mlir::arith::AddIOp::create(builder, loc, rank, one);
-
- auto sizeFieldName = recTy.getTypeList()[1].first;
- mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
- mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
- mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
- builder, loc, fieldIndexType, sizeFieldName, recTy,
- /*typeParams=*/mlir::ValueRange{});
- mlir::Value sizeCoord = fir::CoordinateOp::create(
- builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
- fir::StoreOp::create(builder, loc, size, sizeCoord);
-
- auto rankFieldName = recTy.getTypeList()[2].first;
- mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
- mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
- builder, loc, fieldIndexType, rankFieldName, recTy,
- /*typeParams=*/mlir::ValueRange{});
- mlir::Value rankCoord = fir::CoordinateOp::create(
- builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
- fir::StoreOp::create(builder, loc, rank, rankCoord);
- return res;
+// TEAM_NUMBER
+fir::ExtendedValue
+IntrinsicLibrary::genTeamNumber(mlir::Type,
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ converter->checkCoarrayEnabled();
+ assert(args.size() == 1);
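+  // Lower directly to the MIF dialect team_number operation; the TEAM
+  // argument was prepared by argument lowering.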
+ return mif::TeamNumberOp::create(builder, loc,
+                                   /*team=*/fir::getBase(args[0]));
}
// THIS_IMAGE
@@ -8790,99 +8079,6 @@ IntrinsicLibrary::genThisImage(mlir::Type resultType,
return builder.createConvert(loc, resultType, res);
}
-// THIS_THREAD_BLOCK
-mlir::Value
-IntrinsicLibrary::genThisThreadBlock(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 0);
- auto recTy = mlir::cast<fir::RecordType>(resultType);
-  assert(recTy && "RecordType expected");
- mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
- mlir::Type i32Ty = builder.getI32Type();
-
- // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x;
- mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty);
- mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty);
- mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty);
- mlir::Value size =
- mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY);
- size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX);
-
- // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) +
- // (threadIdx.y * blockDim.x) + threadIdx.x + 1;
- mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty);
- mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty);
- mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty);
- mlir::Value r1 =
- mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY);
- mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX);
- mlir::Value r3 =
- mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX);
- mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3);
- mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX);
- mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
- rank = mlir::arith::AddIOp::create(builder, loc, rank, one);
-
- auto sizeFieldName = recTy.getTypeList()[1].first;
- mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
- mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
- mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
- builder, loc, fieldIndexType, sizeFieldName, recTy,
- /*typeParams=*/mlir::ValueRange{});
- mlir::Value sizeCoord = fir::CoordinateOp::create(
- builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
- fir::StoreOp::create(builder, loc, size, sizeCoord);
-
- auto rankFieldName = recTy.getTypeList()[2].first;
- mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
- mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
- builder, loc, fieldIndexType, rankFieldName, recTy,
- /*typeParams=*/mlir::ValueRange{});
- mlir::Value rankCoord = fir::CoordinateOp::create(
- builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
- fir::StoreOp::create(builder, loc, rank, rankCoord);
- return res;
-}
-
-// THIS_WARP
-mlir::Value IntrinsicLibrary::genThisWarp(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 0);
- auto recTy = mlir::cast<fir::RecordType>(resultType);
-  assert(recTy && "RecordType expected");
- mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
- mlir::Type i32Ty = builder.getI32Type();
-
- // coalesced_group%size = 32
- mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32);
- auto sizeFieldName = recTy.getTypeList()[1].first;
- mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
- mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
- mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
- builder, loc, fieldIndexType, sizeFieldName, recTy,
- /*typeParams=*/mlir::ValueRange{});
- mlir::Value sizeCoord = fir::CoordinateOp::create(
- builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
- fir::StoreOp::create(builder, loc, size, sizeCoord);
-
-  // coalesced_group%rank = (threadIdx.x & 31) + 1
- mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty);
- mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31);
- mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
- mlir::Value masked =
- mlir::arith::AndIOp::create(builder, loc, threadIdX, mask);
- mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one);
- auto rankFieldName = recTy.getTypeList()[2].first;
- mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
- mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
- builder, loc, fieldIndexType, rankFieldName, recTy,
- /*typeParams=*/mlir::ValueRange{});
- mlir::Value rankCoord = fir::CoordinateOp::create(
- builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
- fir::StoreOp::create(builder, loc, rank, rankCoord);
- return res;
-}
-
// TRAILZ
mlir::Value IntrinsicLibrary::genTrailz(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
@@ -9104,65 +8300,6 @@ IntrinsicLibrary::genSum(mlir::Type resultType,
resultType, args);
}
-// SYNCTHREADS
-void IntrinsicLibrary::genSyncThreads(llvm::ArrayRef<fir::ExtendedValue> args) {
- mlir::NVVM::Barrier0Op::create(builder, loc);
-}
-
-// SYNCTHREADS_AND
-mlir::Value
-IntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and";
- mlir::MLIRContext *context = builder.getContext();
- mlir::Type i32 = builder.getI32Type();
- mlir::FunctionType ftype =
- mlir::FunctionType::get(context, {resultType}, {i32});
- auto funcOp = builder.createFunction(loc, funcName, ftype);
- mlir::Value arg = builder.createConvert(loc, i32, args[0]);
- return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0);
-}
-
-// SYNCTHREADS_COUNT
-mlir::Value
-IntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc";
- mlir::MLIRContext *context = builder.getContext();
- mlir::Type i32 = builder.getI32Type();
- mlir::FunctionType ftype =
- mlir::FunctionType::get(context, {resultType}, {i32});
- auto funcOp = builder.createFunction(loc, funcName, ftype);
- mlir::Value arg = builder.createConvert(loc, i32, args[0]);
- return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0);
-}
-
-// SYNCTHREADS_OR
-mlir::Value
-IntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType,
- llvm::ArrayRef<mlir::Value> args) {
- constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or";
- mlir::MLIRContext *context = builder.getContext();
- mlir::Type i32 = builder.getI32Type();
- mlir::FunctionType ftype =
- mlir::FunctionType::get(context, {resultType}, {i32});
- auto funcOp = builder.createFunction(loc, funcName, ftype);
- mlir::Value arg = builder.createConvert(loc, i32, args[0]);
- return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0);
-}
-
-// SYNCWARP
-void IntrinsicLibrary::genSyncWarp(llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 1);
- constexpr llvm::StringLiteral funcName = "llvm.nvvm.bar.warp.sync";
- mlir::Value mask = fir::getBase(args[0]);
- mlir::FunctionType funcType =
- mlir::FunctionType::get(builder.getContext(), {mask.getType()}, {});
- auto funcOp = builder.createFunction(loc, funcName, funcType);
- llvm::SmallVector<mlir::Value> argsList{mask};
- fir::CallOp::create(builder, loc, funcOp, argsList);
-}
-
// SYSTEM
fir::ExtendedValue
IntrinsicLibrary::genSystem(std::optional<mlir::Type> resultType,
@@ -9294,38 +8431,6 @@ IntrinsicLibrary::genTranspose(mlir::Type resultType,
return readAndAddCleanUp(resultMutableBox, resultType, "TRANSPOSE");
}
-// THREADFENCE
-void IntrinsicLibrary::genThreadFence(llvm::ArrayRef<fir::ExtendedValue> args) {
- constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl";
- mlir::FunctionType funcType =
- mlir::FunctionType::get(builder.getContext(), {}, {});
- auto funcOp = builder.createFunction(loc, funcName, funcType);
- llvm::SmallVector<mlir::Value> noArgs;
- fir::CallOp::create(builder, loc, funcOp, noArgs);
-}
-
-// THREADFENCE_BLOCK
-void IntrinsicLibrary::genThreadFenceBlock(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta";
- mlir::FunctionType funcType =
- mlir::FunctionType::get(builder.getContext(), {}, {});
- auto funcOp = builder.createFunction(loc, funcName, funcType);
- llvm::SmallVector<mlir::Value> noArgs;
- fir::CallOp::create(builder, loc, funcOp, noArgs);
-}
-
-// THREADFENCE_SYSTEM
-void IntrinsicLibrary::genThreadFenceSystem(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys";
- mlir::FunctionType funcType =
- mlir::FunctionType::get(builder.getContext(), {}, {});
- auto funcOp = builder.createFunction(loc, funcName, funcType);
- llvm::SmallVector<mlir::Value> noArgs;
- fir::CallOp::create(builder, loc, funcOp, noArgs);
-}
-
// TIME
mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
@@ -9334,226 +8439,6 @@ mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType,
fir::runtime::genTime(builder, loc));
}
-// TMA_BULK_COMMIT_GROUP (CUDA)
-void IntrinsicLibrary::genTMABulkCommitGroup(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 0);
- mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc);
-}
-
-// TMA_BULK_G2S (CUDA)
-void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 4);
- mlir::Value barrier = convertPtrToNVVMSpace(
- builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared);
- mlir::Value dst =
- convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]),
- mlir::NVVM::NVVMMemorySpace::SharedCluster);
- mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]),
- mlir::NVVM::NVVMMemorySpace::Global);
- mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create(
- builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {});
-}
-
-static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc,
- mlir::Value barrier, mlir::Value src,
- mlir::Value dst, mlir::Value nelem,
- mlir::Value eleSize) {
- mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize);
- auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
- barrier = builder.createConvert(loc, llvmPtrTy, barrier);
- dst = builder.createConvert(loc, llvmPtrTy, dst);
- src = builder.createConvert(loc, llvmPtrTy, src);
- mlir::NVVM::InlinePtxOp::create(
- builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {},
- "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], "
- "[%1], %2, [%3];",
- {});
- mlir::NVVM::InlinePtxOp::create(
- builder, loc, mlir::TypeRange{}, {barrier, size}, {},
- "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {});
-}
-
-// TMA_BULK_LOADC4
-void IntrinsicLibrary::genTMABulkLoadC4(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 4);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 8);
- genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
-}
-
-// TMA_BULK_LOADC8
-void IntrinsicLibrary::genTMABulkLoadC8(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 4);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 16);
- genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
-}
-
-// TMA_BULK_LOADI4
-void IntrinsicLibrary::genTMABulkLoadI4(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 4);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 4);
- genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
-}
-
-// TMA_BULK_LOADI8
-void IntrinsicLibrary::genTMABulkLoadI8(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 4);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 8);
- genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
-}
-
-// TMA_BULK_LOADR2
-void IntrinsicLibrary::genTMABulkLoadR2(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 4);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 2);
- genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
-}
-
-// TMA_BULK_LOADR4
-void IntrinsicLibrary::genTMABulkLoadR4(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 4);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 4);
- genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
-}
-
-// TMA_BULK_LOADR8
-void IntrinsicLibrary::genTMABulkLoadR8(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 4);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 8);
- genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
-}
-
-// TMA_BULK_S2G (CUDA)
-void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 3);
- mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]),
- mlir::NVVM::NVVMMemorySpace::Shared);
- mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]),
- mlir::NVVM::NVVMMemorySpace::Global);
- mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(
- builder, loc, dst, src, fir::getBase(args[2]), {}, {});
-
- mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {},
- "cp.async.bulk.commit_group", {});
- mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc,
- builder.getI32IntegerAttr(0), {});
-}
-
-static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc,
- mlir::Value src, mlir::Value dst, mlir::Value count,
- mlir::Value eleSize) {
- mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count);
- src = convertPtrToNVVMSpace(builder, loc, src,
- mlir::NVVM::NVVMMemorySpace::Shared);
- dst = convertPtrToNVVMSpace(builder, loc, dst,
- mlir::NVVM::NVVMMemorySpace::Global);
- mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src,
- size, {}, {});
- mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {},
- "cp.async.bulk.commit_group", {});
- mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc,
- builder.getI32IntegerAttr(0), {});
-}
-
-// TMA_BULK_STORE_C4 (CUDA)
-void IntrinsicLibrary::genTMABulkStoreC4(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 3);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 8);
- genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_STORE_C8 (CUDA)
-void IntrinsicLibrary::genTMABulkStoreC8(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 3);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 16);
- genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_STORE_I4 (CUDA)
-void IntrinsicLibrary::genTMABulkStoreI4(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 3);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 4);
- genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_STORE_I8 (CUDA)
-void IntrinsicLibrary::genTMABulkStoreI8(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 3);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 8);
- genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_STORE_R2 (CUDA)
-void IntrinsicLibrary::genTMABulkStoreR2(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 3);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 2);
- genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_STORE_R4 (CUDA)
-void IntrinsicLibrary::genTMABulkStoreR4(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 3);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 4);
- genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_STORE_R8 (CUDA)
-void IntrinsicLibrary::genTMABulkStoreR8(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 3);
- mlir::Value eleSize =
- builder.createIntegerConstant(loc, builder.getI32Type(), 8);
- genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
- fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_WAIT_GROUP (CUDA)
-void IntrinsicLibrary::genTMABulkWaitGroup(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 0);
- auto group = builder.getIntegerAttr(builder.getI32Type(), 0);
- mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {});
-}
-
// TRIM
fir::ExtendedValue
IntrinsicLibrary::genTrim(mlir::Type resultType,
@@ -9968,6 +8853,9 @@ getIntrinsicArgumentLowering(llvm::StringRef specificName) {
if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name))
if (!ppcHandler->argLoweringRules.hasDefaultRules())
return &ppcHandler->argLoweringRules;
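+  // CUDA intrinsic handlers may also carry non-default argument lowering
+  // rules.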
+ if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name))
+ if (!cudaHandler->argLoweringRules.hasDefaultRules())
+ return &cudaHandler->argLoweringRules;
return nullptr;
}
diff --git a/flang/lib/Optimizer/Builder/Runtime/Character.cpp b/flang/lib/Optimizer/Builder/Runtime/Character.cpp
index 540ecba..2f1772f 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Character.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Character.cpp
@@ -94,27 +94,34 @@ fir::runtime::genCharCompare(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::arith::CmpIPredicate cmp,
mlir::Value lhsBuff, mlir::Value lhsLen,
mlir::Value rhsBuff, mlir::Value rhsLen) {
- mlir::func::FuncOp beginFunc;
- switch (discoverKind(lhsBuff.getType())) {
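+  // The runtime only compares values of the same CHARACTER kind, so reject
+  // mixed-kind comparisons up front.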
+ int lhsKind = discoverKind(lhsBuff.getType());
+ int rhsKind = discoverKind(rhsBuff.getType());
+ if (lhsKind != rhsKind) {
+ fir::emitFatalError(loc, "runtime does not support comparison of different "
+ "CHARACTER kind values");
+ }
+ mlir::func::FuncOp func;
+ switch (lhsKind) {
case 1:
- beginFunc = fir::runtime::getRuntimeFunc<mkRTKey(CharacterCompareScalar1)>(
+ func = fir::runtime::getRuntimeFunc<mkRTKey(CharacterCompareScalar1)>(
loc, builder);
break;
case 2:
- beginFunc = fir::runtime::getRuntimeFunc<mkRTKey(CharacterCompareScalar2)>(
+ func = fir::runtime::getRuntimeFunc<mkRTKey(CharacterCompareScalar2)>(
loc, builder);
break;
case 4:
- beginFunc = fir::runtime::getRuntimeFunc<mkRTKey(CharacterCompareScalar4)>(
+ func = fir::runtime::getRuntimeFunc<mkRTKey(CharacterCompareScalar4)>(
loc, builder);
break;
default:
- llvm_unreachable("runtime does not support CHARACTER KIND");
+ fir::emitFatalError(
+ loc, "unsupported CHARACTER kind value. Runtime expects 1, 2, or 4.");
}
- auto fTy = beginFunc.getFunctionType();
+ auto fTy = func.getFunctionType();
auto args = fir::runtime::createArguments(builder, loc, fTy, lhsBuff, rhsBuff,
lhsLen, rhsLen);
- auto tri = fir::CallOp::create(builder, loc, beginFunc, args).getResult(0);
+ auto tri = fir::CallOp::create(builder, loc, func, args).getResult(0);
auto zero = builder.createIntegerConstant(loc, tri.getType(), 0);
return mlir::arith::CmpIOp::create(builder, loc, cmp, tri, zero);
}
diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
index 110b1b2..a5f16f8 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
@@ -137,6 +137,15 @@ void fir::runtime::genEtime(fir::FirOpBuilder &builder, mlir::Location loc,
fir::CallOp::create(builder, loc, runtimeFunc, args);
}
+void fir::runtime::genFlush(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value unit) {
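+  // Call the Flush runtime entry for the given I/O unit.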
+ auto runtimeFunc = fir::runtime::getRuntimeFunc<mkRTKey(Flush)>(loc, builder);
+ llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments(
+ builder, loc, runtimeFunc.getFunctionType(), unit);
+
+ fir::CallOp::create(builder, loc, runtimeFunc, args);
+}
+
void fir::runtime::genFree(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value ptr) {
auto runtimeFunc = fir::runtime::getRuntimeFunc<mkRTKey(Free)>(loc, builder);
@@ -461,3 +470,34 @@ mlir::Value fir::runtime::genChdir(fir::FirOpBuilder &builder,
fir::runtime::createArguments(builder, loc, func.getFunctionType(), name);
return fir::CallOp::create(builder, loc, func, args).getResult(0);
}
+
+mlir::Value fir::runtime::genIrand(fir::FirOpBuilder &builder,
+ mlir::Location loc, mlir::Value i) {
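+  // Call the Irand runtime entry with the given argument and return its
+  // result.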
+ auto runtimeFunc = fir::runtime::getRuntimeFunc<mkRTKey(Irand)>(loc, builder);
+ mlir::FunctionType runtimeFuncTy = runtimeFunc.getFunctionType();
+
+ llvm::SmallVector<mlir::Value> args =
+ fir::runtime::createArguments(builder, loc, runtimeFuncTy, i);
+ return fir::CallOp::create(builder, loc, runtimeFunc, args).getResult(0);
+}
+
+mlir::Value fir::runtime::genRand(fir::FirOpBuilder &builder,
+ mlir::Location loc, mlir::Value i) {
+ auto runtimeFunc = fir::runtime::getRuntimeFunc<mkRTKey(Rand)>(loc, builder);
+ mlir::FunctionType runtimeFuncTy = runtimeFunc.getFunctionType();
+
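+  // Rand also takes the source location so the runtime can report errors
+  // with an accurate position.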
+ mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+ mlir::Value sourceLine =
+ fir::factory::locationToLineNo(builder, loc, runtimeFuncTy.getInput(2));
+
+ llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments(
+ builder, loc, runtimeFuncTy, i, sourceFile, sourceLine);
+ return fir::CallOp::create(builder, loc, runtimeFunc, args).getResult(0);
+}
+
+void fir::runtime::genShowDescriptor(fir::FirOpBuilder &builder,
+ mlir::Location loc, mlir::Value descAddr) {
+  mlir::func::FuncOp func =
+      fir::runtime::getRuntimeFunc<mkRTKey(ShowDescriptor)>(loc, builder);
+ fir::CallOp::create(builder, loc, func, descAddr);
+}
diff --git a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
index 157d435..343d848 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
@@ -1841,7 +1841,7 @@ mlir::Value fir::runtime::genReduce(fir::FirOpBuilder &builder,
assert((fir::isa_real(eleTy) || fir::isa_integer(eleTy) ||
mlir::isa<fir::LogicalType>(eleTy)) &&
- "expect real, interger or logical");
+ "expect real, integer or logical");
auto [cat, kind] = fir::mlirTypeToCategoryKind(loc, eleTy);
mlir::func::FuncOp func;
diff --git a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
index 7e329e3..5db40af 100644
--- a/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
+++ b/flang/lib/Optimizer/Builder/TemporaryStorage.cpp
@@ -258,13 +258,9 @@ void fir::factory::AnyVariableStack::pushValue(mlir::Location loc,
fir::FirOpBuilder &builder,
mlir::Value variable) {
hlfir::Entity entity{variable};
- mlir::Type storageElementType =
- hlfir::getFortranElementType(retValueBox.getType());
- auto [box, maybeCleanUp] =
- hlfir::convertToBox(loc, builder, entity, storageElementType);
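+  // Box the variable directly with its own box type; unlike convertToBox,
+  // this cannot create a temporary that would require a cleanup.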
+ mlir::Value box =
+ hlfir::genVariableBox(loc, builder, entity, entity.getBoxType());
fir::runtime::genPushDescriptor(loc, builder, opaquePtr, fir::getBase(box));
- if (maybeCleanUp)
- (*maybeCleanUp)();
}
void fir::factory::AnyVariableStack::resetFetchPosition(