diff options
Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 617 | ||||
-rw-r--r-- | llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 1412 | ||||
-rw-r--r-- | llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp | 19 |
3 files changed, 2000 insertions, 48 deletions
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index bff49da..883699d 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -16,6 +16,7 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/IRBuilder.h" #include "llvm/Support/Allocator.h" @@ -99,7 +100,10 @@ public: /// expanded. std::optional<bool> IsGPU; - // Flag for specifying if offloading is mandatory. + /// Flag for specifying if LLVMUsed information should be emitted. + std::optional<bool> EmitLLVMUsedMetaInfo; + + /// Flag for specifying if offloading is mandatory. std::optional<bool> OpenMPOffloadMandatory; /// First separator used between the initial two parts of a name. @@ -107,6 +111,9 @@ public: /// Separator used between all of the rest consecutive parts of s name std::optional<StringRef> Separator; + // Grid Value for the GPU target + std::optional<omp::GV> GridValue; + OpenMPIRBuilderConfig(); OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory, @@ -132,6 +139,11 @@ public: return *OpenMPOffloadMandatory; } + omp::GV getGridValue() const { + assert(GridValue.has_value() && "GridValue is not set"); + return *GridValue; + } + bool hasRequiresFlags() const { return RequiresFlags; } bool hasRequiresReverseOffload() const; bool hasRequiresUnifiedAddress() const; @@ -164,9 +176,11 @@ public: void setIsTargetDevice(bool Value) { IsTargetDevice = Value; } void setIsGPU(bool Value) { IsGPU = Value; } + void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsedMetaInfo = Value; } void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; } void setFirstSeparator(StringRef FS) { FirstSeparator = FS; } void setSeparator(StringRef S) { Separator = S; } + void setGridValue(omp::GV G) { GridValue = G; } void setHasRequiresReverseOffload(bool Value); void setHasRequiresUnifiedAddress(bool Value); @@ -607,14 +621,15 @@ public: /// Generator for '#omp barrier' /// /// \param Loc The location where the barrier directive was encountered. - /// \param DK The kind of directive that caused the barrier. + /// \param Kind The kind of directive that caused the barrier. /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier. /// \param CheckCancelFlag Flag to indicate a cancel barrier return value /// should be checked and acted upon. + /// \param ThreadID Optional parameter to pass in any existing ThreadID value. /// /// \returns The insertion point after the barrier. - InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive DK, - bool ForceSimpleCall = false, + InsertPointTy createBarrier(const LocationDescription &Loc, + omp::Directive Kind, bool ForceSimpleCall = false, bool CheckCancelFlag = true); /// Generator for '#omp cancel' @@ -1235,27 +1250,55 @@ public: getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName = ""); - /// Functions used to generate reductions. Such functions take two Values - /// representing LHS and RHS of the reduction, respectively, and a reference - /// to the value that is updated to refer to the reduction result. - using ReductionGenTy = - function_ref<InsertPointTy(InsertPointTy, Value *, Value *, Value *&)>; + /// Enum class for the RedctionGen CallBack type to be used. + enum class ReductionGenCBKind { Clang, MLIR }; + + /// ReductionGen CallBack for Clang + /// + /// \param CodeGenIP InsertPoint for CodeGen. + /// \param Index Index of the ReductionInfo to generate code for. + /// \param LHSPtr Optionally used by Clang to return the LHSPtr it used for + /// codegen, used for fixup later. + /// \param RHSPtr Optionally used by Clang to + /// return the RHSPtr it used for codegen, used for fixup later. + /// \param CurFn Optionally used by Clang to pass in the Current Function as + /// Clang context may be old. + using ReductionGenClangCBTy = + std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index, + Value **LHS, Value **RHS, Function *CurFn)>; + + /// ReductionGen CallBack for MLIR + /// + /// \param CodeGenIP InsertPoint for CodeGen. + /// \param LHS Pass in the LHS Value to be used for CodeGen. + /// \param RHS Pass in the RHS Value to be used for CodeGen. + using ReductionGenCBTy = std::function<InsertPointTy( + InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>; /// Functions used to generate atomic reductions. Such functions take two /// Values representing pointers to LHS and RHS of the reduction, as well as /// the element type of these pointers. They are expected to atomically /// update the LHS to the reduced value. - using AtomicReductionGenTy = - function_ref<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>; + using ReductionGenAtomicCBTy = + std::function<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>; + + /// Enum class for reduction evaluation types scalar, complex and aggregate. + enum class EvalKind { Scalar, Complex, Aggregate }; /// Information about an OpenMP reduction. struct ReductionInfo { ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable, - ReductionGenTy ReductionGen, - AtomicReductionGenTy AtomicReductionGen) + EvalKind EvaluationKind, ReductionGenCBTy ReductionGen, + ReductionGenClangCBTy ReductionGenClang, + ReductionGenAtomicCBTy AtomicReductionGen) : ElementType(ElementType), Variable(Variable), - PrivateVariable(PrivateVariable), ReductionGen(ReductionGen), + PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind), + ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang), AtomicReductionGen(AtomicReductionGen) {} + ReductionInfo(Value *PrivateVariable) + : ElementType(nullptr), Variable(nullptr), + PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar), + ReductionGen(), ReductionGenClang(), AtomicReductionGen() {} /// Reduction element type, must match pointee type of variable. Type *ElementType; @@ -1266,18 +1309,547 @@ public: /// Thread-private partial reduction variable. Value *PrivateVariable; + /// Reduction evaluation kind - scalar, complex or aggregate. + EvalKind EvaluationKind; + /// Callback for generating the reduction body. The IR produced by this will /// be used to combine two values in a thread-safe context, e.g., under /// lock or within the same thread, and therefore need not be atomic. - ReductionGenTy ReductionGen; + ReductionGenCBTy ReductionGen; + + /// Clang callback for generating the reduction body. The IR produced by + /// this will be used to combine two values in a thread-safe context, e.g., + /// under lock or within the same thread, and therefore need not be atomic. + ReductionGenClangCBTy ReductionGenClang; /// Callback for generating the atomic reduction body, may be null. The IR /// produced by this will be used to atomically combine two values during /// reduction. If null, the implementation will use the non-atomic version /// along with the appropriate synchronization mechanisms. - AtomicReductionGenTy AtomicReductionGen; + ReductionGenAtomicCBTy AtomicReductionGen; + }; + + enum class CopyAction : unsigned { + // RemoteLaneToThread: Copy over a Reduce list from a remote lane in + // the warp using shuffle instructions. + RemoteLaneToThread, + // ThreadCopy: Make a copy of a Reduce list on the thread's stack. + ThreadCopy, + }; + + struct CopyOptionsTy { + Value *RemoteLaneOffset = nullptr; + Value *ScratchpadIndex = nullptr; + Value *ScratchpadWidth = nullptr; }; + /// Supporting functions for Reductions CodeGen. +private: + /// Emit the llvm.used metadata. + void emitUsed(StringRef Name, std::vector<llvm::WeakTrackingVH> &List); + + /// Get the id of the current thread on the GPU. + Value *getGPUThreadID(); + + /// Get the GPU warp size. + Value *getGPUWarpSize(); + + /// Get the id of the warp in the block. + /// We assume that the warp size is 32, which is always the case + /// on the NVPTX device, to generate more efficient code. + Value *getNVPTXWarpID(); + + /// Get the id of the current lane in the Warp. + /// We assume that the warp size is 32, which is always the case + /// on the NVPTX device, to generate more efficient code. + Value *getNVPTXLaneID(); + + /// Cast value to the specified type. + Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType); + + /// This function creates calls to one of two shuffle functions to copy + /// variables between lanes in a warp. + Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element, + Type *ElementType, Value *Offset); + + /// Function to shuffle over the value from the remote lane. + void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr, + Type *ElementType, Value *Offset, + Type *ReductionArrayTy); + + /// Emit instructions to copy a Reduce list, which contains partially + /// aggregated values, in the specified direction. + void emitReductionListCopy( + InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, + ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase, + CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}); + + /// Emit a helper that reduces data across two OpenMP threads (lanes) + /// in the same warp. It uses shuffle instructions to copy over data from + /// a remote lane's stack. The reduction algorithm performed is specified + /// by the fourth parameter. + /// + /// Algorithm Versions. + /// Full Warp Reduce (argument value 0): + /// This algorithm assumes that all 32 lanes are active and gathers + /// data from these 32 lanes, producing a single resultant value. + /// Contiguous Partial Warp Reduce (argument value 1): + /// This algorithm assumes that only a *contiguous* subset of lanes + /// are active. This happens for the last warp in a parallel region + /// when the user specified num_threads is not an integer multiple of + /// 32. This contiguous subset always starts with the zeroth lane. + /// Partial Warp Reduce (argument value 2): + /// This algorithm gathers data from any number of lanes at any position. + /// All reduced values are stored in the lowest possible lane. The set + /// of problems every algorithm addresses is a super set of those + /// addressable by algorithms with a lower version number. Overhead + /// increases as algorithm version increases. + /// + /// Terminology + /// Reduce element: + /// Reduce element refers to the individual data field with primitive + /// data types to be combined and reduced across threads. + /// Reduce list: + /// Reduce list refers to a collection of local, thread-private + /// reduce elements. + /// Remote Reduce list: + /// Remote Reduce list refers to a collection of remote (relative to + /// the current thread) reduce elements. + /// + /// We distinguish between three states of threads that are important to + /// the implementation of this function. + /// Alive threads: + /// Threads in a warp executing the SIMT instruction, as distinguished from + /// threads that are inactive due to divergent control flow. + /// Active threads: + /// The minimal set of threads that has to be alive upon entry to this + /// function. The computation is correct iff active threads are alive. + /// Some threads are alive but they are not active because they do not + /// contribute to the computation in any useful manner. Turning them off + /// may introduce control flow overheads without any tangible benefits. + /// Effective threads: + /// In order to comply with the argument requirements of the shuffle + /// function, we must keep all lanes holding data alive. But at most + /// half of them perform value aggregation; we refer to this half of + /// threads as effective. The other half is simply handing off their + /// data. + /// + /// Procedure + /// Value shuffle: + /// In this step active threads transfer data from higher lane positions + /// in the warp to lower lane positions, creating Remote Reduce list. + /// Value aggregation: + /// In this step, effective threads combine their thread local Reduce list + /// with Remote Reduce list and store the result in the thread local + /// Reduce list. + /// Value copy: + /// In this step, we deal with the assumption made by algorithm 2 + /// (i.e. contiguity assumption). When we have an odd number of lanes + /// active, say 2k+1, only k threads will be effective and therefore k + /// new values will be produced. However, the Reduce list owned by the + /// (2k+1)th thread is ignored in the value aggregation. Therefore + /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so + /// that the contiguity assumption still holds. + /// + /// \param ReductionInfos Array type containing the ReductionOps. + /// \param ReduceFn The reduction function. + /// \param FuncAttrs Optional param to specify any function attributes that + /// need to be copied to the new function. + /// + /// \return The ShuffleAndReduce function. + Function *emitShuffleAndReduceFunction( + ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, + Function *ReduceFn, AttributeList FuncAttrs); + + /// This function emits a helper that gathers Reduce lists from the first + /// lane of every active warp to lanes in the first warp. + /// + /// void inter_warp_copy_func(void* reduce_data, num_warps) + /// shared smem[warp_size]; + /// For all data entries D in reduce_data: + /// sync + /// If (I am the first lane in each warp) + /// Copy my local D to smem[warp_id] + /// sync + /// if (I am the first warp) + /// Copy smem[thread_id] to my local D + /// + /// \param Loc The insert and source location description. + /// \param ReductionInfos Array type containing the ReductionOps. + /// \param FuncAttrs Optional param to specify any function attributes that + /// need to be copied to the new function. + /// + /// \return The InterWarpCopy function. + Function *emitInterWarpCopyFunction(const LocationDescription &Loc, + ArrayRef<ReductionInfo> ReductionInfos, + AttributeList FuncAttrs); + + /// This function emits a helper that copies all the reduction variables from + /// the team into the provided global buffer for the reduction variables. + /// + /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) + /// For all data entries D in reduce_data: + /// Copy local D to buffer.D[Idx] + /// + /// \param ReductionInfos Array type containing the ReductionOps. + /// \param ReductionsBufferTy The StructTy for the reductions buffer. + /// \param FuncAttrs Optional param to specify any function attributes that + /// need to be copied to the new function. + /// + /// \return The ListToGlobalCopy function. + Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos, + Type *ReductionsBufferTy, + AttributeList FuncAttrs); + + /// This function emits a helper that copies all the reduction variables from + /// the team into the provided global buffer for the reduction variables. + /// + /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) + /// For all data entries D in reduce_data: + /// Copy buffer.D[Idx] to local D; + /// + /// \param ReductionInfos Array type containing the ReductionOps. + /// \param ReductionsBufferTy The StructTy for the reductions buffer. + /// \param FuncAttrs Optional param to specify any function attributes that + /// need to be copied to the new function. + /// + /// \return The GlobalToList function. + Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos, + Type *ReductionsBufferTy, + AttributeList FuncAttrs); + + /// This function emits a helper that reduces all the reduction variables from + /// the team into the provided global buffer for the reduction variables. + /// + /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data) + /// void *GlobPtrs[]; + /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; + /// ... + /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; + /// reduce_function(GlobPtrs, reduce_data); + /// + /// \param ReductionInfos Array type containing the ReductionOps. + /// \param ReduceFn The reduction function. + /// \param ReductionsBufferTy The StructTy for the reductions buffer. + /// \param FuncAttrs Optional param to specify any function attributes that + /// need to be copied to the new function. + /// + /// \return The ListToGlobalReduce function. + Function * + emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos, + Function *ReduceFn, Type *ReductionsBufferTy, + AttributeList FuncAttrs); + + /// This function emits a helper that reduces all the reduction variables from + /// the team into the provided global buffer for the reduction variables. + /// + /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data) + /// void *GlobPtrs[]; + /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; + /// ... + /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; + /// reduce_function(reduce_data, GlobPtrs); + /// + /// \param ReductionInfos Array type containing the ReductionOps. + /// \param ReduceFn The reduction function. + /// \param ReductionsBufferTy The StructTy for the reductions buffer. + /// \param FuncAttrs Optional param to specify any function attributes that + /// need to be copied to the new function. + /// + /// \return The GlobalToListReduce function. + Function * + emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos, + Function *ReduceFn, Type *ReductionsBufferTy, + AttributeList FuncAttrs); + + /// Get the function name of a reduction function. + std::string getReductionFuncName(StringRef Name) const; + + /// Emits reduction function. + /// \param ReducerName Name of the function calling the reduction. + /// \param ReductionInfos Array type containing the ReductionOps. + /// \param ReductionGenCBKind Optional param to specify Clang or MLIR + /// CodeGenCB kind. + /// \param FuncAttrs Optional param to specify any function attributes that + /// need to be copied to the new function. + /// + /// \return The reduction function. + Function *createReductionFunction( + StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, + ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, + AttributeList FuncAttrs = {}); + +public: + /// + /// Design of OpenMP reductions on the GPU + /// + /// Consider a typical OpenMP program with one or more reduction + /// clauses: + /// + /// float foo; + /// double bar; + /// #pragma omp target teams distribute parallel for \ + /// reduction(+:foo) reduction(*:bar) + /// for (int i = 0; i < N; i++) { + /// foo += A[i]; bar *= B[i]; + /// } + /// + /// where 'foo' and 'bar' are reduced across all OpenMP threads in + /// all teams. In our OpenMP implementation on the NVPTX device an + /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads + /// within a team are mapped to CUDA threads within a threadblock. + /// Our goal is to efficiently aggregate values across all OpenMP + /// threads such that: + /// + /// - the compiler and runtime are logically concise, and + /// - the reduction is performed efficiently in a hierarchical + /// manner as follows: within OpenMP threads in the same warp, + /// across warps in a threadblock, and finally across teams on + /// the NVPTX device. + /// + /// Introduction to Decoupling + /// + /// We would like to decouple the compiler and the runtime so that the + /// latter is ignorant of the reduction variables (number, data types) + /// and the reduction operators. This allows a simpler interface + /// and implementation while still attaining good performance. + /// + /// Pseudocode for the aforementioned OpenMP program generated by the + /// compiler is as follows: + /// + /// 1. Create private copies of reduction variables on each OpenMP + /// thread: 'foo_private', 'bar_private' + /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned + /// to it and writes the result in 'foo_private' and 'bar_private' + /// respectively. + /// 3. Call the OpenMP runtime on the GPU to reduce within a team + /// and store the result on the team master: + /// + /// __kmpc_nvptx_parallel_reduce_nowait_v2(..., + /// reduceData, shuffleReduceFn, interWarpCpyFn) + /// + /// where: + /// struct ReduceData { + /// double *foo; + /// double *bar; + /// } reduceData + /// reduceData.foo = &foo_private + /// reduceData.bar = &bar_private + /// + /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two + /// auxiliary functions generated by the compiler that operate on + /// variables of type 'ReduceData'. They aid the runtime perform + /// algorithmic steps in a data agnostic manner. + /// + /// 'shuffleReduceFn' is a pointer to a function that reduces data + /// of type 'ReduceData' across two OpenMP threads (lanes) in the + /// same warp. It takes the following arguments as input: + /// + /// a. variable of type 'ReduceData' on the calling lane, + /// b. its lane_id, + /// c. an offset relative to the current lane_id to generate a + /// remote_lane_id. The remote lane contains the second + /// variable of type 'ReduceData' that is to be reduced. + /// d. an algorithm version parameter determining which reduction + /// algorithm to use. + /// + /// 'shuffleReduceFn' retrieves data from the remote lane using + /// efficient GPU shuffle intrinsics and reduces, using the + /// algorithm specified by the 4th parameter, the two operands + /// element-wise. The result is written to the first operand. + /// + /// Different reduction algorithms are implemented in different + /// runtime functions, all calling 'shuffleReduceFn' to perform + /// the essential reduction step. Therefore, based on the 4th + /// parameter, this function behaves slightly differently to + /// cooperate with the runtime to ensure correctness under + /// different circumstances. + /// + /// 'InterWarpCpyFn' is a pointer to a function that transfers + /// reduced variables across warps. It tunnels, through CUDA + /// shared memory, the thread-private data of type 'ReduceData' + /// from lane 0 of each warp to a lane in the first warp. + /// 4. Call the OpenMP runtime on the GPU to reduce across teams. + /// The last team writes the global reduced value to memory. + /// + /// ret = __kmpc_nvptx_teams_reduce_nowait(..., + /// reduceData, shuffleReduceFn, interWarpCpyFn, + /// scratchpadCopyFn, loadAndReduceFn) + /// + /// 'scratchpadCopyFn' is a helper that stores reduced + /// data from the team master to a scratchpad array in + /// global memory. + /// + /// 'loadAndReduceFn' is a helper that loads data from + /// the scratchpad array and reduces it with the input + /// operand. + /// + /// These compiler generated functions hide address + /// calculation and alignment information from the runtime. + /// 5. if ret == 1: + /// The team master of the last team stores the reduced + /// result to the globals in memory. + /// foo += reduceData.foo; bar *= reduceData.bar + /// + /// + /// Warp Reduction Algorithms + /// + /// On the warp level, we have three algorithms implemented in the + /// OpenMP runtime depending on the number of active lanes: + /// + /// Full Warp Reduction + /// + /// The reduce algorithm within a warp where all lanes are active + /// is implemented in the runtime as follows: + /// + /// full_warp_reduce(void *reduce_data, + /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { + /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) + /// ShuffleReduceFn(reduce_data, 0, offset, 0); + /// } + /// + /// The algorithm completes in log(2, WARPSIZE) steps. + /// + /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is + /// not used therefore we save instructions by not retrieving lane_id + /// from the corresponding special registers. The 4th parameter, which + /// represents the version of the algorithm being used, is set to 0 to + /// signify full warp reduction. + /// + /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: + /// + /// #reduce_elem refers to an element in the local lane's data structure + /// #remote_elem is retrieved from a remote lane + /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); + /// reduce_elem = reduce_elem REDUCE_OP remote_elem; + /// + /// Contiguous Partial Warp Reduction + /// + /// This reduce algorithm is used within a warp where only the first + /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the + /// number of OpenMP threads in a parallel region is not a multiple of + /// WARPSIZE. The algorithm is implemented in the runtime as follows: + /// + /// void + /// contiguous_partial_reduce(void *reduce_data, + /// kmp_ShuffleReductFctPtr ShuffleReduceFn, + /// int size, int lane_id) { + /// int curr_size; + /// int offset; + /// curr_size = size; + /// mask = curr_size/2; + /// while (offset>0) { + /// ShuffleReduceFn(reduce_data, lane_id, offset, 1); + /// curr_size = (curr_size+1)/2; + /// offset = curr_size/2; + /// } + /// } + /// + /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: + /// + /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); + /// if (lane_id < offset) + /// reduce_elem = reduce_elem REDUCE_OP remote_elem + /// else + /// reduce_elem = remote_elem + /// + /// This algorithm assumes that the data to be reduced are located in a + /// contiguous subset of lanes starting from the first. When there is + /// an odd number of active lanes, the data in the last lane is not + /// aggregated with any other lane's dat but is instead copied over. + /// + /// Dispersed Partial Warp Reduction + /// + /// This algorithm is used within a warp when any discontiguous subset of + /// lanes are active. It is used to implement the reduction operation + /// across lanes in an OpenMP simd region or in a nested parallel region. + /// + /// void + /// dispersed_partial_reduce(void *reduce_data, + /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { + /// int size, remote_id; + /// int logical_lane_id = number_of_active_lanes_before_me() * 2; + /// do { + /// remote_id = next_active_lane_id_right_after_me(); + /// # the above function returns 0 of no active lane + /// # is present right after the current lane. + /// size = number_of_active_lanes_in_this_warp(); + /// logical_lane_id /= 2; + /// ShuffleReduceFn(reduce_data, logical_lane_id, + /// remote_id-1-threadIdx.x, 2); + /// } while (logical_lane_id % 2 == 0 && size > 1); + /// } + /// + /// There is no assumption made about the initial state of the reduction. + /// Any number of lanes (>=1) could be active at any position. The reduction + /// result is returned in the first active lane. + /// + /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: + /// + /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); + /// if (lane_id % 2 == 0 && offset > 0) + /// reduce_elem = reduce_elem REDUCE_OP remote_elem + /// else + /// reduce_elem = remote_elem + /// + /// + /// Intra-Team Reduction + /// + /// This function, as implemented in the runtime call + /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP + /// threads in a team. It first reduces within a warp using the + /// aforementioned algorithms. We then proceed to gather all such + /// reduced values at the first warp. + /// + /// The runtime makes use of the function 'InterWarpCpyFn', which copies + /// data from each of the "warp master" (zeroth lane of each warp, where + /// warp-reduced data is held) to the zeroth warp. This step reduces (in + /// a mathematical sense) the problem of reduction across warp masters in + /// a block to the problem of warp reduction. + /// + /// + /// Inter-Team Reduction + /// + /// Once a team has reduced its data to a single value, it is stored in + /// a global scratchpad array. Since each team has a distinct slot, this + /// can be done without locking. + /// + /// The last team to write to the scratchpad array proceeds to reduce the + /// scratchpad array. One or more workers in the last team use the helper + /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., + /// the k'th worker reduces every k'th element. + /// + /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to + /// reduce across workers and compute a globally reduced value. + /// + /// \param Loc The location where the reduction was + /// encountered. Must be within the associate + /// directive and after the last local access to the + /// reduction variables. + /// \param AllocaIP An insertion point suitable for allocas usable + /// in reductions. + /// \param CodeGenIP An insertion point suitable for code + /// generation. \param ReductionInfos A list of info on each reduction + /// variable. \param IsNoWait Optional flag set if the reduction is + /// marked as + /// nowait. + /// \param IsTeamsReduction Optional flag set if it is a teams + /// reduction. + /// \param HasDistribute Optional flag set if it is a + /// distribute reduction. + /// \param GridValue Optional GPU grid value. + /// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be + /// used for teams reduction. + /// \param SrcLocInfo Source location information global. + InsertPointTy createReductionsGPU( + const LocationDescription &Loc, InsertPointTy AllocaIP, + InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos, + bool IsNoWait = false, bool IsTeamsReduction = false, + bool HasDistribute = false, + ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, + std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024, + Value *SrcLocInfo = nullptr); + // TODO: provide atomic and non-atomic reduction generators for reduction // operators defined by the OpenMP specification. @@ -1421,19 +1993,6 @@ public: Value *NumThreads, Value *HostPtr, ArrayRef<Value *> KernelArgs); - /// Generate a barrier runtime call. - /// - /// \param Loc The location at which the request originated and is fulfilled. - /// \param DK The directive which caused the barrier - /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier. - /// \param CheckCancelFlag Flag to indicate a cancel barrier return value - /// should be checked and acted upon. - /// - /// \returns The insertion point after the barrier. - InsertPointTy emitBarrierImpl(const LocationDescription &Loc, - omp::Directive DK, bool ForceSimpleCall, - bool CheckCancelFlag); - /// Generate a flush runtime call. /// /// \param Loc The location at which the request originated and is fulfilled. diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 5154c33..3b29bab 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -795,6 +795,12 @@ void OpenMPIRBuilder::finalize(Function *Fn) { if (!OffloadInfoManager.empty()) createOffloadEntriesAndInfoMetadata(ErrorReportFn); + + if (Config.EmitLLVMUsedMetaInfo.value_or(false)) { + std::vector<WeakTrackingVH> LLVMCompilerUsed = { + M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")}; + emitUsed("llvm.compiler.used", LLVMCompilerUsed); + } } OpenMPIRBuilder::~OpenMPIRBuilder() { @@ -927,16 +933,11 @@ Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) { } OpenMPIRBuilder::InsertPointTy -OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive DK, +OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind, bool ForceSimpleCall, bool CheckCancelFlag) { if (!updateToLocation(Loc)) return Loc.IP; - return emitBarrierImpl(Loc, DK, ForceSimpleCall, CheckCancelFlag); -} -OpenMPIRBuilder::InsertPointTy -OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind, - bool ForceSimpleCall, bool CheckCancelFlag) { // Build call __kmpc_cancel_barrier(loc, thread_id) or // __kmpc_barrier(loc, thread_id); @@ -2117,15 +2118,1396 @@ OpenMPIRBuilder::createSection(const LocationDescription &Loc, /*IsCancellable*/ true); } -/// Create a function with a unique name and a "void (i8*, i8*)" signature in -/// the given module and return it. -Function *getFreshReductionFunc(Module &M) { +static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) { + BasicBlock::iterator IT(I); + IT++; + return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT); +} + +void OpenMPIRBuilder::emitUsed(StringRef Name, + std::vector<WeakTrackingVH> &List) { + if (List.empty()) + return; + + // Convert List to what ConstantArray needs. + SmallVector<Constant *, 8> UsedArray; + UsedArray.resize(List.size()); + for (unsigned I = 0, E = List.size(); I != E; ++I) + UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast<Constant>(&*List[I]), Builder.getPtrTy()); + + if (UsedArray.empty()) + return; + ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size()); + + auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage, + ConstantArray::get(ATy, UsedArray), Name); + + GV->setSection("llvm.metadata"); +} + +Value *OpenMPIRBuilder::getGPUThreadID() { + return Builder.CreateCall( + getOrCreateRuntimeFunction(M, + OMPRTL___kmpc_get_hardware_thread_id_in_block), + {}); +} + +Value *OpenMPIRBuilder::getGPUWarpSize() { + return Builder.CreateCall( + getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {}); +} + +Value *OpenMPIRBuilder::getNVPTXWarpID() { + unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size); + return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id"); +} + +Value *OpenMPIRBuilder::getNVPTXLaneID() { + unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size); + assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device."); + unsigned LaneIDMask = ~0u >> (32u - LaneIDBits); + return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask), + "nvptx_lane_id"); +} + +Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From, + Type *ToType) { + Type *FromType = From->getType(); + uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType); + uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType); + assert(FromSize > 0 && "From size must be greater than zero"); + assert(ToSize > 0 && "To size must be greater than zero"); + if (FromType == ToType) + return From; + if (FromSize == ToSize) + return Builder.CreateBitCast(From, ToType); + if (ToType->isIntegerTy() && FromType->isIntegerTy()) + return Builder.CreateIntCast(From, ToType, /*isSigned*/ true); + InsertPointTy SaveIP = Builder.saveIP(); + Builder.restoreIP(AllocaIP); + Value *CastItem = Builder.CreateAlloca(ToType); + Builder.restoreIP(SaveIP); + + Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast( + CastItem, FromType->getPointerTo()); + Builder.CreateStore(From, ValCastItem); + return Builder.CreateLoad(ToType, CastItem); +} + +Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP, + Value *Element, + Type *ElementType, + Value *Offset) { + uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType); + assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction"); + + // Cast all types to 32- or 64-bit values before calling shuffle routines. + Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64); + Value *ElemCast = castValueToType(AllocaIP, Element, CastTy); + Value *WarpSize = + Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true); + Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr( + Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32 + : RuntimeFunction::OMPRTL___kmpc_shuffle_int64); + Value *WarpSizeCast = + Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true); + Value *ShuffleCall = + Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast}); + return castValueToType(AllocaIP, ShuffleCall, CastTy); +} + +void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, + Value *DstAddr, Type *ElemType, + Value *Offset, Type *ReductionArrayTy) { + uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType); + // Create the loop over the big sized data. + // ptr = (void*)Elem; + // ptrEnd = (void*) Elem + 1; + // Step = 8; + // while (ptr + Step < ptrEnd) + // shuffle((int64_t)*ptr); + // Step = 4; + // while (ptr + Step < ptrEnd) + // shuffle((int32_t)*ptr); + // ... + Type *IndexTy = Builder.getIndexTy( + M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); + Value *ElemPtr = DstAddr; + Value *Ptr = SrcAddr; + for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) { + if (Size < IntSize) + continue; + Type *IntType = Builder.getIntNTy(IntSize * 8); + Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast( + Ptr, IntType->getPointerTo(), Ptr->getName() + ".ascast"); + Value *SrcAddrGEP = + Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)}); + ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( + ElemPtr, IntType->getPointerTo(), ElemPtr->getName() + ".ascast"); + + Function *CurFunc = Builder.GetInsertBlock()->getParent(); + if ((Size / IntSize) > 1) { + Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast( + SrcAddrGEP, Builder.getPtrTy()); + BasicBlock *PreCondBB = + BasicBlock::Create(M.getContext(), ".shuffle.pre_cond"); + BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then"); + BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit"); + BasicBlock *CurrentBB = Builder.GetInsertBlock(); + emitBlock(PreCondBB, CurFunc); + PHINode *PhiSrc = + Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2); + PhiSrc->addIncoming(Ptr, CurrentBB); + PHINode *PhiDest = + Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2); + PhiDest->addIncoming(ElemPtr, CurrentBB); + Ptr = PhiSrc; + ElemPtr = PhiDest; + Value *PtrDiff = Builder.CreatePtrDiff( + Builder.getInt8Ty(), PtrEnd, + Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy())); + Builder.CreateCondBr( + Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB, + ExitBB); + emitBlock(ThenBB, CurFunc); + Value *Res = createRuntimeShuffleFunction( + AllocaIP, + Builder.CreateAlignedLoad( + IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)), + IntType, Offset); + Builder.CreateAlignedStore(Res, ElemPtr, + M.getDataLayout().getPrefTypeAlign(ElemType)); + Value *LocalPtr = + Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)}); + Value *LocalElemPtr = + Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)}); + PhiSrc->addIncoming(LocalPtr, ThenBB); + PhiDest->addIncoming(LocalElemPtr, ThenBB); + emitBranch(PreCondBB); + emitBlock(ExitBB, CurFunc); + } else { + Value *Res = createRuntimeShuffleFunction( + AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset); + if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() < + Res->getType()->getScalarSizeInBits()) + Res = Builder.CreateTrunc(Res, ElemType); + Builder.CreateStore(Res, ElemPtr); + Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)}); + ElemPtr = + Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)}); + } + Size = Size % IntSize; + } +} + +void OpenMPIRBuilder::emitReductionListCopy( + InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, + ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase, + CopyOptionsTy CopyOptions) { + Type *IndexTy = Builder.getIndexTy( + M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); + Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; + + // Iterates, element-by-element, through the source Reduce list and + // make a copy. + for (auto En : enumerate(ReductionInfos)) { + const ReductionInfo &RI = En.value(); + Value *SrcElementAddr = nullptr; + Value *DestElementAddr = nullptr; + Value *DestElementPtrAddr = nullptr; + // Should we shuffle in an element from a remote lane? + bool ShuffleInElement = false; + // Set to true to update the pointer in the dest Reduce list to a + // newly created element. + bool UpdateDestListPtr = false; + + // Step 1.1: Get the address for the src element in the Reduce list. + Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP( + ReductionArrayTy, SrcBase, + {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr); + + // Step 1.2: Create a temporary to store the element in the destination + // Reduce list. + DestElementPtrAddr = Builder.CreateInBoundsGEP( + ReductionArrayTy, DestBase, + {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + switch (Action) { + case CopyAction::RemoteLaneToThread: { + InsertPointTy CurIP = Builder.saveIP(); + Builder.restoreIP(AllocaIP); + AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr, + ".omp.reduction.element"); + DestAlloca->setAlignment( + M.getDataLayout().getPrefTypeAlign(RI.ElementType)); + DestElementAddr = DestAlloca; + DestElementAddr = + Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(), + DestElementAddr->getName() + ".ascast"); + Builder.restoreIP(CurIP); + ShuffleInElement = true; + UpdateDestListPtr = true; + break; + } + case CopyAction::ThreadCopy: { + DestElementAddr = + Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr); + break; + } + } + + // Now that all active lanes have read the element in the + // Reduce list, shuffle over the value from the remote lane. + if (ShuffleInElement) { + shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType, + RemoteLaneOffset, ReductionArrayTy); + } else { + switch (RI.EvaluationKind) { + case EvalKind::Scalar: { + Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr); + // Store the source element value to the dest element address. + Builder.CreateStore(Elem, DestElementAddr); + break; + } + case EvalKind::Complex: { + Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32( + RI.ElementType, SrcElementAddr, 0, 0, ".realp"); + Value *SrcReal = Builder.CreateLoad( + RI.ElementType->getStructElementType(0), SrcRealPtr, ".real"); + Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32( + RI.ElementType, SrcElementAddr, 0, 1, ".imagp"); + Value *SrcImg = Builder.CreateLoad( + RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag"); + + Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32( + RI.ElementType, DestElementAddr, 0, 0, ".realp"); + Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32( + RI.ElementType, DestElementAddr, 0, 1, ".imagp"); + Builder.CreateStore(SrcReal, DestRealPtr); + Builder.CreateStore(SrcImg, DestImgPtr); + break; + } + case EvalKind::Aggregate: { + Value *SizeVal = Builder.getInt64( + M.getDataLayout().getTypeStoreSize(RI.ElementType)); + Builder.CreateMemCpy( + DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType), + SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType), + SizeVal, false); + break; + } + }; + } + + // Step 3.1: Modify reference in dest Reduce list as needed. + // Modifying the reference in Reduce list to point to the newly + // created element. The element is live in the current function + // scope and that of functions it invokes (i.e., reduce_function). + // RemoteReduceData[i] = (void*)&RemoteElem + if (UpdateDestListPtr) { + Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast( + DestElementAddr, Builder.getPtrTy(), + DestElementAddr->getName() + ".ascast"); + Builder.CreateStore(CastDestAddr, DestElementPtrAddr); + } + } +} + +Function *OpenMPIRBuilder::emitInterWarpCopyFunction( + const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos, + AttributeList FuncAttrs) { + InsertPointTy SavedIP = Builder.saveIP(); + LLVMContext &Ctx = M.getContext(); + FunctionType *FuncTy = FunctionType::get( + Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()}, + /* IsVarArg */ false); + Function *WcFunc = + Function::Create(FuncTy, GlobalVariable::InternalLinkage, + "_omp_reduction_inter_warp_copy_func", &M); + WcFunc->setAttributes(FuncAttrs); + WcFunc->addParamAttr(0, Attribute::NoUndef); + WcFunc->addParamAttr(1, Attribute::NoUndef); + BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc); + Builder.SetInsertPoint(EntryBB); + + // ReduceList: thread local Reduce list. + // At the stage of the computation when this function is called, partially + // aggregated values reside in the first lane of every active warp. + Argument *ReduceListArg = WcFunc->getArg(0); + // NumWarps: number of warps active in the parallel region. This could + // be smaller than 32 (max warps in a CTA) for partial block reduction. + Argument *NumWarpsArg = WcFunc->getArg(1); + + // This array is used as a medium to transfer, one reduce element at a time, + // the data from the first lane of every warp to lanes in the first warp + // in order to perform the final step of a reduction in a parallel region + // (reduction across warps). The array is placed in NVPTX __shared__ memory + // for reduced latency, as well as to have a distinct copy for concurrently + // executing target regions. The array is declared with common linkage so + // as to be shared across compilation units. + StringRef TransferMediumName = + "__openmp_nvptx_data_transfer_temporary_storage"; + GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName); + unsigned WarpSize = Config.getGridValue().GV_Warp_Size; + ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize); + if (!TransferMedium) { + TransferMedium = new GlobalVariable( + M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage, + UndefValue::get(ArrayTy), TransferMediumName, + /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal, + /*AddressSpace=*/3); + } + + // Get the CUDA thread id of the current OpenMP thread on the GPU. + Value *GPUThreadID = getGPUThreadID(); + // nvptx_lane_id = nvptx_id % warpsize + Value *LaneID = getNVPTXLaneID(); + // nvptx_warp_id = nvptx_id / warpsize + Value *WarpID = getNVPTXWarpID(); + + InsertPointTy AllocaIP = + InsertPointTy(Builder.GetInsertBlock(), + Builder.GetInsertBlock()->getFirstInsertionPt()); + Type *Arg0Type = ReduceListArg->getType(); + Type *Arg1Type = NumWarpsArg->getType(); + Builder.restoreIP(AllocaIP); + AllocaInst *ReduceListAlloca = Builder.CreateAlloca( + Arg0Type, nullptr, ReduceListArg->getName() + ".addr"); + AllocaInst *NumWarpsAlloca = + Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr"); + Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast"); + Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + NumWarpsAlloca, Arg1Type->getPointerTo(), + NumWarpsAlloca->getName() + ".ascast"); + Builder.CreateStore(ReduceListArg, ReduceListAddrCast); + Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast); + AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca); + InsertPointTy CodeGenIP = + getInsertPointAfterInstr(&Builder.GetInsertBlock()->back()); + Builder.restoreIP(CodeGenIP); + + Value *ReduceList = + Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast); + + for (auto En : enumerate(ReductionInfos)) { + // + // Warp master copies reduce element to transfer medium in __shared__ + // memory. + // + const ReductionInfo &RI = En.value(); + unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType); + for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) { + Type *CType = Builder.getIntNTy(TySize * 8); + + unsigned NumIters = RealTySize / TySize; + if (NumIters == 0) + continue; + Value *Cnt = nullptr; + Value *CntAddr = nullptr; + BasicBlock *PrecondBB = nullptr; + BasicBlock *ExitBB = nullptr; + if (NumIters > 1) { + CodeGenIP = Builder.saveIP(); + Builder.restoreIP(AllocaIP); + CntAddr = + Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr"); + + CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(), + CntAddr->getName() + ".ascast"); + Builder.restoreIP(CodeGenIP); + Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()), + CntAddr, + /*Volatile=*/false); + PrecondBB = BasicBlock::Create(Ctx, "precond"); + ExitBB = BasicBlock::Create(Ctx, "exit"); + BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body"); + emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent()); + Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr, + /*Volatile=*/false); + Value *Cmp = Builder.CreateICmpULT( + Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters)); + Builder.CreateCondBr(Cmp, BodyBB, ExitBB); + emitBlock(BodyBB, Builder.GetInsertBlock()->getParent()); + } + + // kmpc_barrier. + createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), + omp::Directive::OMPD_unknown, + /* ForceSimpleCall */ false, + /* CheckCancelFlag */ true); + BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then"); + BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else"); + BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont"); + + // if (lane_id == 0) + Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master"); + Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); + emitBlock(ThenBB, Builder.GetInsertBlock()->getParent()); + + // Reduce element = LocalReduceList[i] + auto *RedListArrayTy = + ArrayType::get(Builder.getPtrTy(), ReductionInfos.size()); + Type *IndexTy = Builder.getIndexTy( + M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); + Value *ElemPtrPtr = + Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList, + {ConstantInt::get(IndexTy, 0), + ConstantInt::get(IndexTy, En.index())}); + // elemptr = ((CopyType*)(elemptrptr)) + I + Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr); + if (NumIters > 1) + ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt); + + // Get pointer to location in transfer medium. + // MediumPtr = &medium[warp_id] + Value *MediumPtr = Builder.CreateInBoundsGEP( + ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID}); + // elem = *elemptr + //*MediumPtr = elem + Value *Elem = Builder.CreateLoad(CType, ElemPtr); + // Store the source element value to the dest element address. + Builder.CreateStore(Elem, MediumPtr, + /*IsVolatile*/ true); + Builder.CreateBr(MergeBB); + + // else + emitBlock(ElseBB, Builder.GetInsertBlock()->getParent()); + Builder.CreateBr(MergeBB); + + // endif + emitBlock(MergeBB, Builder.GetInsertBlock()->getParent()); + createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), + omp::Directive::OMPD_unknown, + /* ForceSimpleCall */ false, + /* CheckCancelFlag */ true); + + // Warp 0 copies reduce element from transfer medium + BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then"); + BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else"); + BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont"); + + Value *NumWarpsVal = + Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast); + // Up to 32 threads in warp 0 are active. + Value *IsActiveThread = + Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread"); + Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB); + + emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent()); + + // SecMediumPtr = &medium[tid] + // SrcMediumVal = *SrcMediumPtr + Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP( + ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID}); + // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I + Value *TargetElemPtrPtr = + Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList, + {ConstantInt::get(IndexTy, 0), + ConstantInt::get(IndexTy, En.index())}); + Value *TargetElemPtrVal = + Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr); + Value *TargetElemPtr = TargetElemPtrVal; + if (NumIters > 1) + TargetElemPtr = + Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt); + + // *TargetElemPtr = SrcMediumVal; + Value *SrcMediumValue = + Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true); + Builder.CreateStore(SrcMediumValue, TargetElemPtr); + Builder.CreateBr(W0MergeBB); + + emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent()); + Builder.CreateBr(W0MergeBB); + + emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent()); + + if (NumIters > 1) { + Cnt = Builder.CreateNSWAdd( + Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1)); + Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false); + + auto *CurFn = Builder.GetInsertBlock()->getParent(); + emitBranch(PrecondBB); + emitBlock(ExitBB, CurFn); + } + RealTySize %= TySize; + } + } + + Builder.CreateRetVoid(); + Builder.restoreIP(SavedIP); + + return WcFunc; +} + +Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( + ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn, + AttributeList FuncAttrs) { + LLVMContext &Ctx = M.getContext(); + FunctionType *FuncTy = + FunctionType::get(Builder.getVoidTy(), + {Builder.getPtrTy(), Builder.getInt16Ty(), + Builder.getInt16Ty(), Builder.getInt16Ty()}, + /* IsVarArg */ false); + Function *SarFunc = + Function::Create(FuncTy, GlobalVariable::InternalLinkage, + "_omp_reduction_shuffle_and_reduce_func", &M); + SarFunc->setAttributes(FuncAttrs); + SarFunc->addParamAttr(0, Attribute::NoUndef); + SarFunc->addParamAttr(1, Attribute::NoUndef); + SarFunc->addParamAttr(2, Attribute::NoUndef); + SarFunc->addParamAttr(3, Attribute::NoUndef); + SarFunc->addParamAttr(1, Attribute::SExt); + SarFunc->addParamAttr(2, Attribute::SExt); + SarFunc->addParamAttr(3, Attribute::SExt); + BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc); + Builder.SetInsertPoint(EntryBB); + + // Thread local Reduce list used to host the values of data to be reduced. + Argument *ReduceListArg = SarFunc->getArg(0); + // Current lane id; could be logical. + Argument *LaneIDArg = SarFunc->getArg(1); + // Offset of the remote source lane relative to the current lane. + Argument *RemoteLaneOffsetArg = SarFunc->getArg(2); + // Algorithm version. This is expected to be known at compile time. + Argument *AlgoVerArg = SarFunc->getArg(3); + + Type *ReduceListArgType = ReduceListArg->getType(); + Type *LaneIDArgType = LaneIDArg->getType(); + Type *LaneIDArgPtrType = LaneIDArg->getType()->getPointerTo(); + Value *ReduceListAlloca = Builder.CreateAlloca( + ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr"); + Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr, + LaneIDArg->getName() + ".addr"); + Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca( + LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr"); + Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr, + AlgoVerArg->getName() + ".addr"); + ArrayType *RedListArrayTy = + ArrayType::get(Builder.getPtrTy(), ReductionInfos.size()); + + // Create a local thread-private variable to host the Reduce list + // from a remote lane. + Instruction *RemoteReductionListAlloca = Builder.CreateAlloca( + RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list"); + + Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + ReduceListAlloca, ReduceListArgType, + ReduceListAlloca->getName() + ".ascast"); + Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast"); + Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + RemoteLaneOffsetAlloca, LaneIDArgPtrType, + RemoteLaneOffsetAlloca->getName() + ".ascast"); + Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast"); + Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + RemoteReductionListAlloca, Builder.getPtrTy(), + RemoteReductionListAlloca->getName() + ".ascast"); + + Builder.CreateStore(ReduceListArg, ReduceListAddrCast); + Builder.CreateStore(LaneIDArg, LaneIdAddrCast); + Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast); + Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast); + + Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast); + Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast); + Value *RemoteLaneOffset = + Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast); + Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast); + + InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca); + + // This loop iterates through the list of reduce elements and copies, + // element by element, from a remote lane in the warp to RemoteReduceList, + // hosted on the thread's stack. + emitReductionListCopy( + AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos, + ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr}); + + // The actions to be performed on the Remote Reduce list is dependent + // on the algorithm version. + // + // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 && + // LaneId % 2 == 0 && Offset > 0): + // do the reduction value aggregation + // + // The thread local variable Reduce list is mutated in place to host the + // reduced data, which is the aggregated value produced from local and + // remote lanes. + // + // Note that AlgoVer is expected to be a constant integer known at compile + // time. + // When AlgoVer==0, the first conjunction evaluates to true, making + // the entire predicate true during compile time. + // When AlgoVer==1, the second conjunction has only the second part to be + // evaluated during runtime. Other conjunctions evaluates to false + // during compile time. + // When AlgoVer==2, the third conjunction has only the second part to be + // evaluated during runtime. Other conjunctions evaluates to false + // during compile time. + Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer); + Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1)); + Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset); + Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp); + Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2)); + Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1)); + Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1); + Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp); + Value *RemoteOffsetComp = + Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0)); + Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp); + Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1); + Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2); + + BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then"); + BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else"); + BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont"); + + Builder.CreateCondBr(CondReduce, ThenBB, ElseBB); + emitBlock(ThenBB, Builder.GetInsertBlock()->getParent()); + Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( + ReduceList, Builder.getPtrTy()); + Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( + RemoteListAddrCast, Builder.getPtrTy()); + Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr}) + ->addFnAttr(Attribute::NoUnwind); + Builder.CreateBr(MergeBB); + + emitBlock(ElseBB, Builder.GetInsertBlock()->getParent()); + Builder.CreateBr(MergeBB); + + emitBlock(MergeBB, Builder.GetInsertBlock()->getParent()); + + // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local + // Reduce list. + Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1)); + Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset); + Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset); + + BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then"); + BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else"); + BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont"); + Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB); + + emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent()); + emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, + ReductionInfos, RemoteListAddrCast, ReduceList); + Builder.CreateBr(CpyMergeBB); + + emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent()); + Builder.CreateBr(CpyMergeBB); + + emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent()); + + Builder.CreateRetVoid(); + + return SarFunc; +} + +Function *OpenMPIRBuilder::emitListToGlobalCopyFunction( + ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy, + AttributeList FuncAttrs) { + OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); + LLVMContext &Ctx = M.getContext(); + FunctionType *FuncTy = FunctionType::get( + Builder.getVoidTy(), + {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()}, + /* IsVarArg */ false); + Function *LtGCFunc = + Function::Create(FuncTy, GlobalVariable::InternalLinkage, + "_omp_reduction_list_to_global_copy_func", &M); + LtGCFunc->setAttributes(FuncAttrs); + LtGCFunc->addParamAttr(0, Attribute::NoUndef); + LtGCFunc->addParamAttr(1, Attribute::NoUndef); + LtGCFunc->addParamAttr(2, Attribute::NoUndef); + + BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc); + Builder.SetInsertPoint(EntryBlock); + + // Buffer: global reduction buffer. + Argument *BufferArg = LtGCFunc->getArg(0); + // Idx: index of the buffer. + Argument *IdxArg = LtGCFunc->getArg(1); + // ReduceList: thread local Reduce list. + Argument *ReduceListArg = LtGCFunc->getArg(2); + + Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr, + BufferArg->getName() + ".addr"); + Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, + IdxArg->getName() + ".addr"); + Value *ReduceListArgAlloca = Builder.CreateAlloca( + Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr"); + Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + BufferArgAlloca, Builder.getPtrTy(), + BufferArgAlloca->getName() + ".ascast"); + Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast"); + Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + ReduceListArgAlloca, Builder.getPtrTy(), + ReduceListArgAlloca->getName() + ".ascast"); + + Builder.CreateStore(BufferArg, BufferArgAddrCast); + Builder.CreateStore(IdxArg, IdxArgAddrCast); + Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast); + + Value *LocalReduceList = + Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast); + Value *BufferArgVal = + Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast); + Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)}; + Type *IndexTy = Builder.getIndexTy( + M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); + for (auto En : enumerate(ReductionInfos)) { + const ReductionInfo &RI = En.value(); + auto *RedListArrayTy = + ArrayType::get(Builder.getPtrTy(), ReductionInfos.size()); + // Reduce element = LocalReduceList[i] + Value *ElemPtrPtr = Builder.CreateInBoundsGEP( + RedListArrayTy, LocalReduceList, + {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + // elemptr = ((CopyType*)(elemptrptr)) + I + Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr); + + // Global = Buffer.VD[Idx]; + Value *BufferVD = + Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs); + Value *GlobVal = Builder.CreateConstInBoundsGEP2_32( + ReductionsBufferTy, BufferVD, 0, En.index()); + + switch (RI.EvaluationKind) { + case EvalKind::Scalar: { + Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr); + Builder.CreateStore(TargetElement, GlobVal); + break; + } + case EvalKind::Complex: { + Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32( + RI.ElementType, ElemPtr, 0, 0, ".realp"); + Value *SrcReal = Builder.CreateLoad( + RI.ElementType->getStructElementType(0), SrcRealPtr, ".real"); + Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32( + RI.ElementType, ElemPtr, 0, 1, ".imagp"); + Value *SrcImg = Builder.CreateLoad( + RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag"); + + Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32( + RI.ElementType, GlobVal, 0, 0, ".realp"); + Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32( + RI.ElementType, GlobVal, 0, 1, ".imagp"); + Builder.CreateStore(SrcReal, DestRealPtr); + Builder.CreateStore(SrcImg, DestImgPtr); + break; + } + case EvalKind::Aggregate: { + Value *SizeVal = + Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType)); + Builder.CreateMemCpy( + GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr, + M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false); + break; + } + } + } + + Builder.CreateRetVoid(); + Builder.restoreIP(OldIP); + return LtGCFunc; +} + +Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( + ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn, + Type *ReductionsBufferTy, AttributeList FuncAttrs) { + OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); + LLVMContext &Ctx = M.getContext(); + FunctionType *FuncTy = FunctionType::get( + Builder.getVoidTy(), + {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()}, + /* IsVarArg */ false); + Function *LtGRFunc = + Function::Create(FuncTy, GlobalVariable::InternalLinkage, + "_omp_reduction_list_to_global_reduce_func", &M); + LtGRFunc->setAttributes(FuncAttrs); + LtGRFunc->addParamAttr(0, Attribute::NoUndef); + LtGRFunc->addParamAttr(1, Attribute::NoUndef); + LtGRFunc->addParamAttr(2, Attribute::NoUndef); + + BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc); + Builder.SetInsertPoint(EntryBlock); + + // Buffer: global reduction buffer. + Argument *BufferArg = LtGRFunc->getArg(0); + // Idx: index of the buffer. + Argument *IdxArg = LtGRFunc->getArg(1); + // ReduceList: thread local Reduce list. + Argument *ReduceListArg = LtGRFunc->getArg(2); + + Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr, + BufferArg->getName() + ".addr"); + Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, + IdxArg->getName() + ".addr"); + Value *ReduceListArgAlloca = Builder.CreateAlloca( + Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr"); + auto *RedListArrayTy = + ArrayType::get(Builder.getPtrTy(), ReductionInfos.size()); + + // 1. Build a list of reduction variables. + // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; + Value *LocalReduceList = + Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list"); + + Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + BufferArgAlloca, Builder.getPtrTy(), + BufferArgAlloca->getName() + ".ascast"); + Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast"); + Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + ReduceListArgAlloca, Builder.getPtrTy(), + ReduceListArgAlloca->getName() + ".ascast"); + Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + LocalReduceList, Builder.getPtrTy(), + LocalReduceList->getName() + ".ascast"); + + Builder.CreateStore(BufferArg, BufferArgAddrCast); + Builder.CreateStore(IdxArg, IdxArgAddrCast); + Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast); + + Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast); + Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)}; + Type *IndexTy = Builder.getIndexTy( + M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); + for (auto En : enumerate(ReductionInfos)) { + Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP( + RedListArrayTy, LocalReduceListAddrCast, + {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + Value *BufferVD = + Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs); + // Global = Buffer.VD[Idx]; + Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32( + ReductionsBufferTy, BufferVD, 0, En.index()); + Builder.CreateStore(GlobValPtr, TargetElementPtrPtr); + } + + // Call reduce_function(GlobalReduceList, ReduceList) + Value *ReduceList = + Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast); + Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList}) + ->addFnAttr(Attribute::NoUnwind); + Builder.CreateRetVoid(); + Builder.restoreIP(OldIP); + return LtGRFunc; +} + +Function *OpenMPIRBuilder::emitGlobalToListCopyFunction( + ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy, + AttributeList FuncAttrs) { + OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); + LLVMContext &Ctx = M.getContext(); + FunctionType *FuncTy = FunctionType::get( + Builder.getVoidTy(), + {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()}, + /* IsVarArg */ false); + Function *LtGCFunc = + Function::Create(FuncTy, GlobalVariable::InternalLinkage, + "_omp_reduction_global_to_list_copy_func", &M); + LtGCFunc->setAttributes(FuncAttrs); + LtGCFunc->addParamAttr(0, Attribute::NoUndef); + LtGCFunc->addParamAttr(1, Attribute::NoUndef); + LtGCFunc->addParamAttr(2, Attribute::NoUndef); + + BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc); + Builder.SetInsertPoint(EntryBlock); + + // Buffer: global reduction buffer. + Argument *BufferArg = LtGCFunc->getArg(0); + // Idx: index of the buffer. + Argument *IdxArg = LtGCFunc->getArg(1); + // ReduceList: thread local Reduce list. + Argument *ReduceListArg = LtGCFunc->getArg(2); + + Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr, + BufferArg->getName() + ".addr"); + Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, + IdxArg->getName() + ".addr"); + Value *ReduceListArgAlloca = Builder.CreateAlloca( + Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr"); + Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + BufferArgAlloca, Builder.getPtrTy(), + BufferArgAlloca->getName() + ".ascast"); + Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast"); + Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + ReduceListArgAlloca, Builder.getPtrTy(), + ReduceListArgAlloca->getName() + ".ascast"); + Builder.CreateStore(BufferArg, BufferArgAddrCast); + Builder.CreateStore(IdxArg, IdxArgAddrCast); + Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast); + + Value *LocalReduceList = + Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast); + Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast); + Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)}; + Type *IndexTy = Builder.getIndexTy( + M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); + for (auto En : enumerate(ReductionInfos)) { + const OpenMPIRBuilder::ReductionInfo &RI = En.value(); + auto *RedListArrayTy = + ArrayType::get(Builder.getPtrTy(), ReductionInfos.size()); + // Reduce element = LocalReduceList[i] + Value *ElemPtrPtr = Builder.CreateInBoundsGEP( + RedListArrayTy, LocalReduceList, + {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + // elemptr = ((CopyType*)(elemptrptr)) + I + Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr); + // Global = Buffer.VD[Idx]; + Value *BufferVD = + Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs); + Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32( + ReductionsBufferTy, BufferVD, 0, En.index()); + + switch (RI.EvaluationKind) { + case EvalKind::Scalar: { + Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr); + Builder.CreateStore(TargetElement, ElemPtr); + break; + } + case EvalKind::Complex: { + Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32( + RI.ElementType, GlobValPtr, 0, 0, ".realp"); + Value *SrcReal = Builder.CreateLoad( + RI.ElementType->getStructElementType(0), SrcRealPtr, ".real"); + Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32( + RI.ElementType, GlobValPtr, 0, 1, ".imagp"); + Value *SrcImg = Builder.CreateLoad( + RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag"); + + Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32( + RI.ElementType, ElemPtr, 0, 0, ".realp"); + Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32( + RI.ElementType, ElemPtr, 0, 1, ".imagp"); + Builder.CreateStore(SrcReal, DestRealPtr); + Builder.CreateStore(SrcImg, DestImgPtr); + break; + } + case EvalKind::Aggregate: { + Value *SizeVal = + Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType)); + Builder.CreateMemCpy( + ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType), + GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType), + SizeVal, false); + break; + } + } + } + + Builder.CreateRetVoid(); + Builder.restoreIP(OldIP); + return LtGCFunc; +} + +Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( + ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn, + Type *ReductionsBufferTy, AttributeList FuncAttrs) { + OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); + LLVMContext &Ctx = M.getContext(); + auto *FuncTy = FunctionType::get( + Builder.getVoidTy(), + {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()}, + /* IsVarArg */ false); + Function *LtGRFunc = + Function::Create(FuncTy, GlobalVariable::InternalLinkage, + "_omp_reduction_global_to_list_reduce_func", &M); + LtGRFunc->setAttributes(FuncAttrs); + LtGRFunc->addParamAttr(0, Attribute::NoUndef); + LtGRFunc->addParamAttr(1, Attribute::NoUndef); + LtGRFunc->addParamAttr(2, Attribute::NoUndef); + + BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc); + Builder.SetInsertPoint(EntryBlock); + + // Buffer: global reduction buffer. + Argument *BufferArg = LtGRFunc->getArg(0); + // Idx: index of the buffer. + Argument *IdxArg = LtGRFunc->getArg(1); + // ReduceList: thread local Reduce list. + Argument *ReduceListArg = LtGRFunc->getArg(2); + + Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr, + BufferArg->getName() + ".addr"); + Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, + IdxArg->getName() + ".addr"); + Value *ReduceListArgAlloca = Builder.CreateAlloca( + Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr"); + ArrayType *RedListArrayTy = + ArrayType::get(Builder.getPtrTy(), ReductionInfos.size()); + + // 1. Build a list of reduction variables. + // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; + Value *LocalReduceList = + Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list"); + + Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + BufferArgAlloca, Builder.getPtrTy(), + BufferArgAlloca->getName() + ".ascast"); + Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast"); + Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + ReduceListArgAlloca, Builder.getPtrTy(), + ReduceListArgAlloca->getName() + ".ascast"); + Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast( + LocalReduceList, Builder.getPtrTy(), + LocalReduceList->getName() + ".ascast"); + + Builder.CreateStore(BufferArg, BufferArgAddrCast); + Builder.CreateStore(IdxArg, IdxArgAddrCast); + Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast); + + Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast); + Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)}; + Type *IndexTy = Builder.getIndexTy( + M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); + for (auto En : enumerate(ReductionInfos)) { + Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP( + RedListArrayTy, ReductionList, + {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + // Global = Buffer.VD[Idx]; + Value *BufferVD = + Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs); + Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32( + ReductionsBufferTy, BufferVD, 0, En.index()); + Builder.CreateStore(GlobValPtr, TargetElementPtrPtr); + } + + // Call reduce_function(ReduceList, GlobalReduceList) + Value *ReduceList = + Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast); + Builder.CreateCall(ReduceFn, {ReduceList, ReductionList}) + ->addFnAttr(Attribute::NoUnwind); + Builder.CreateRetVoid(); + Builder.restoreIP(OldIP); + return LtGRFunc; +} + +std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const { + std::string Suffix = + createPlatformSpecificName({"omp", "reduction", "reduction_func"}); + return (Name + Suffix).str(); +} + +Function *OpenMPIRBuilder::createReductionFunction( + StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, + ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) { + auto *FuncTy = FunctionType::get(Builder.getVoidTy(), + {Builder.getPtrTy(), Builder.getPtrTy()}, + /* IsVarArg */ false); + std::string Name = getReductionFuncName(ReducerName); + Function *ReductionFunc = + Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M); + ReductionFunc->setAttributes(FuncAttrs); + ReductionFunc->addParamAttr(0, Attribute::NoUndef); + ReductionFunc->addParamAttr(1, Attribute::NoUndef); + BasicBlock *EntryBB = + BasicBlock::Create(M.getContext(), "entry", ReductionFunc); + Builder.SetInsertPoint(EntryBB); + + // Need to alloca memory here and deal with the pointers before getting + // LHS/RHS pointers out + Value *LHSArrayPtr = nullptr; + Value *RHSArrayPtr = nullptr; + Argument *Arg0 = ReductionFunc->getArg(0); + Argument *Arg1 = ReductionFunc->getArg(1); + Type *Arg0Type = Arg0->getType(); + Type *Arg1Type = Arg1->getType(); + + Value *LHSAlloca = + Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr"); + Value *RHSAlloca = + Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr"); + Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast"); + Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( + RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast"); + Builder.CreateStore(Arg0, LHSAddrCast); + Builder.CreateStore(Arg1, RHSAddrCast); + LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast); + RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast); + + Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size()); + Type *IndexTy = Builder.getIndexTy( + M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); + SmallVector<Value *> LHSPtrs, RHSPtrs; + for (auto En : enumerate(ReductionInfos)) { + const ReductionInfo &RI = En.value(); + Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP( + RedArrayTy, RHSArrayPtr, + {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr); + Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( + RHSI8Ptr, RI.PrivateVariable->getType(), + RHSI8Ptr->getName() + ".ascast"); + + Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP( + RedArrayTy, LHSArrayPtr, + {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr); + Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( + LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast"); + + if (ReductionGenCBKind == ReductionGenCBKind::Clang) { + LHSPtrs.emplace_back(LHSPtr); + RHSPtrs.emplace_back(RHSPtr); + } else { + Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); + Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); + Value *Reduced; + RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced); + if (!Builder.GetInsertBlock()) + return ReductionFunc; + Builder.CreateStore(Reduced, LHSPtr); + } + } + + if (ReductionGenCBKind == ReductionGenCBKind::Clang) + for (auto En : enumerate(ReductionInfos)) { + unsigned Index = En.index(); + const ReductionInfo &RI = En.value(); + Value *LHSFixupPtr, *RHSFixupPtr; + Builder.restoreIP(RI.ReductionGenClang( + Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc)); + + // Fix the CallBack code genereated to use the correct Values for the LHS + // and RHS + LHSFixupPtr->replaceUsesWithIf( + LHSPtrs[Index], [ReductionFunc](const Use &U) { + return cast<Instruction>(U.getUser())->getParent()->getParent() == + ReductionFunc; + }); + RHSFixupPtr->replaceUsesWithIf( + RHSPtrs[Index], [ReductionFunc](const Use &U) { + return cast<Instruction>(U.getUser())->getParent()->getParent() == + ReductionFunc; + }); + } + + Builder.CreateRetVoid(); + return ReductionFunc; +} + +static void +checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, + bool IsGPU) { + for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) { + (void)RI; + assert(RI.Variable && "expected non-null variable"); + assert(RI.PrivateVariable && "expected non-null private variable"); + assert((RI.ReductionGen || RI.ReductionGenClang) && + "expected non-null reduction generator callback"); + if (!IsGPU) { + assert( + RI.Variable->getType() == RI.PrivateVariable->getType() && + "expected variables and their private equivalents to have the same " + "type"); + } + assert(RI.Variable->getType()->isPointerTy() && + "expected variables to be pointers"); + } +} + +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU( + const LocationDescription &Loc, InsertPointTy AllocaIP, + InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos, + bool IsNoWait, bool IsTeamsReduction, bool HasDistribute, + ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue, + unsigned ReductionBufNum, Value *SrcLocInfo) { + if (!updateToLocation(Loc)) + return InsertPointTy(); + Builder.restoreIP(CodeGenIP); + checkReductionInfos(ReductionInfos, /*IsGPU*/ true); + LLVMContext &Ctx = M.getContext(); + + // Source location for the ident struct + if (!SrcLocInfo) { + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize); + } + + if (ReductionInfos.size() == 0) + return Builder.saveIP(); + + Function *CurFunc = Builder.GetInsertBlock()->getParent(); + AttributeList FuncAttrs; + AttrBuilder AttrBldr(Ctx); + for (auto Attr : CurFunc->getAttributes().getFnAttrs()) + AttrBldr.addAttribute(Attr); + AttrBldr.removeAttribute(Attribute::OptimizeNone); + FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr); + + Function *ReductionFunc = nullptr; + CodeGenIP = Builder.saveIP(); + ReductionFunc = + createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(), + ReductionInfos, ReductionGenCBKind, FuncAttrs); + Builder.restoreIP(CodeGenIP); + + // Set the grid value in the config needed for lowering later on + if (GridValue.has_value()) + Config.setGridValue(GridValue.value()); + else + Config.setGridValue(getGridValue(T, ReductionFunc)); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize); + Value *RTLoc = + getOrCreateIdent(SrcLocStr, SrcLocStrSize, omp::IdentFlag(0), 0); + + // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList), + // RedList, shuffle_reduce_func, interwarp_copy_func); + // or + // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>); + Value *Res; + + // 1. Build a list of reduction variables. + // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; + auto Size = ReductionInfos.size(); + Type *PtrTy = PointerType::getUnqual(Ctx); + Type *RedArrayTy = ArrayType::get(PtrTy, Size); + CodeGenIP = Builder.saveIP(); + Builder.restoreIP(AllocaIP); + Value *ReductionListAlloca = + Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list"); + Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast( + ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast"); + Builder.restoreIP(CodeGenIP); + Type *IndexTy = Builder.getIndexTy( + M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); + for (auto En : enumerate(ReductionInfos)) { + const ReductionInfo &RI = En.value(); + Value *ElemPtr = Builder.CreateInBoundsGEP( + RedArrayTy, ReductionList, + {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); + Value *CastElem = + Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy); + Builder.CreateStore(CastElem, ElemPtr); + } + CodeGenIP = Builder.saveIP(); + Function *SarFunc = + emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs); + Function *WcFunc = emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs); + Builder.restoreIP(CodeGenIP); + + Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy); + + unsigned MaxDataSize = 0; + SmallVector<Type *> ReductionTypeArgs; + for (auto En : enumerate(ReductionInfos)) { + auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType); + if (Size > MaxDataSize) + MaxDataSize = Size; + ReductionTypeArgs.emplace_back(En.value().ElementType); + } + Value *ReductionDataSize = + Builder.getInt64(MaxDataSize * ReductionInfos.size()); + if (!IsTeamsReduction) { + Value *SarFuncCast = + Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy); + Value *WcFuncCast = + Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy); + Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast}; + Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr( + RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2); + Res = Builder.CreateCall(Pv2Ptr, Args); + } else { + CodeGenIP = Builder.saveIP(); + StructType *ReductionsBufferTy = StructType::create( + Ctx, ReductionTypeArgs, "struct._globalized_locals_ty"); + Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr( + RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer); + Function *LtGCFunc = emitListToGlobalCopyFunction( + ReductionInfos, ReductionsBufferTy, FuncAttrs); + Function *LtGRFunc = emitListToGlobalReduceFunction( + ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs); + Function *GtLCFunc = emitGlobalToListCopyFunction( + ReductionInfos, ReductionsBufferTy, FuncAttrs); + Function *GtLRFunc = emitGlobalToListReduceFunction( + ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs); + Builder.restoreIP(CodeGenIP); + + Value *KernelTeamsReductionPtr = Builder.CreateCall( + RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr"); + + Value *Args3[] = {RTLoc, + KernelTeamsReductionPtr, + Builder.getInt32(ReductionBufNum), + ReductionDataSize, + RL, + SarFunc, + WcFunc, + LtGCFunc, + LtGRFunc, + GtLCFunc, + GtLRFunc}; + + Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr( + RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2); + Res = Builder.CreateCall(TeamsReduceFn, Args3); + } + + // 5. Build if (res == 1) + BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done"); + BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then"); + Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1)); + Builder.CreateCondBr(Cond, ThenBB, ExitBB); + + // 6. Build then branch: where we have reduced values in the master + // thread in each team. + // __kmpc_end_reduce{_nowait}(<gtid>); + // break; + emitBlock(ThenBB, CurFunc); + + // Add emission of __kmpc_end_reduce{_nowait}(<gtid>); + for (auto En : enumerate(ReductionInfos)) { + const ReductionInfo &RI = En.value(); + Value *LHS = RI.Variable; + Value *RHS = + Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy); + + if (ReductionGenCBKind == ReductionGenCBKind::Clang) { + Value *LHSPtr, *RHSPtr; + Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(), + &LHSPtr, &RHSPtr, CurFunc)); + + // Fix the CallBack code genereated to use the correct Values for the LHS + // and RHS + LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) { + return cast<Instruction>(U.getUser())->getParent()->getParent() == + ReductionFunc; + }); + RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) { + return cast<Instruction>(U.getUser())->getParent()->getParent() == + ReductionFunc; + }); + } else { + assert(false && "Unhandled ReductionGenCBKind"); + } + } + emitBlock(ExitBB, CurFunc); + + Config.setEmitLLVMUsed(); + + return Builder.saveIP(); +} + +static Function *getFreshReductionFunc(Module &M) { Type *VoidTy = Type::getVoidTy(M.getContext()); Type *Int8PtrTy = PointerType::getUnqual(M.getContext()); auto *FuncTy = FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false); return Function::Create(FuncTy, GlobalVariable::InternalLinkage, - M.getDataLayout().getDefaultGlobalsAddressSpace(), ".omp.reduction.func", &M); } @@ -2178,10 +3560,9 @@ OpenMPIRBuilder::createReductions(const LocationDescription &Loc, Module *Module = Func->getParent(); uint32_t SrcLocStrSize; Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); - bool CanGenerateAtomic = - llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) { - return RI.AtomicReductionGen; - }); + bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) { + return RI.AtomicReductionGen; + }); Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize, CanGenerateAtomic ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE @@ -4632,7 +6013,8 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, Ident, DynamicEnvironment, }); - Twine KernelEnvironmentName = KernelName + "_kernel_environment"; + std::string KernelEnvironmentName = + (KernelName + "_kernel_environment").str(); GlobalVariable *KernelEnvironmentGV = new GlobalVariable( M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage, KernelEnvironmentInitializer, KernelEnvironmentName, diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 3ed3034..8653bbd 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -4980,8 +4980,14 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) { Builder.restoreIP(AfterIP); OpenMPIRBuilder::ReductionInfo ReductionInfos[] = { - {SumType, SumReduced, SumPrivatized, sumReduction, sumAtomicReduction}, - {XorType, XorReduced, XorPrivatized, xorReduction, xorAtomicReduction}}; + {SumType, SumReduced, SumPrivatized, + /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, sumReduction, + /*ReductionGenClang=*/nullptr, sumAtomicReduction}, + {XorType, XorReduced, XorPrivatized, + /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, xorReduction, + /*ReductionGenClang=*/nullptr, xorAtomicReduction}}; + OMPBuilder.Config.setIsGPU(false); + bool ReduceVariableByRef[] = {false, false}; OMPBuilder.createReductions(BodyIP, BodyAllocaIP, ReductionInfos, @@ -5232,15 +5238,20 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { /* NumThreads */ nullptr, OMP_PROC_BIND_default, /* IsCancellable */ false); + OMPBuilder.Config.setIsGPU(false); bool ReduceVariableByRef[] = {false}; OMPBuilder.createReductions( FirstBodyIP, FirstBodyAllocaIP, - {{SumType, SumReduced, SumPrivatized, sumReduction, sumAtomicReduction}}, + {{SumType, SumReduced, SumPrivatized, + /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, sumReduction, + /*ReductionGenClang=*/nullptr, sumAtomicReduction}}, ReduceVariableByRef); OMPBuilder.createReductions( SecondBodyIP, SecondBodyAllocaIP, - {{XorType, XorReduced, XorPrivatized, xorReduction, xorAtomicReduction}}, + {{XorType, XorReduced, XorPrivatized, + /*EvaluationKind=*/OpenMPIRBuilder::EvalKind::Scalar, xorReduction, + /*ReductionGenClang=*/nullptr, xorAtomicReduction}}, ReduceVariableByRef); Builder.restoreIP(AfterIP); |