diff options
author | Alexey Bataev <a.bataev@hotmail.com> | 2018-11-02 14:54:07 +0000 |
---|---|---|
committer | Alexey Bataev <a.bataev@hotmail.com> | 2018-11-02 14:54:07 +0000 |
commit | e40901806fec4ea87e00cd408f8ca31c5868de0c (patch) | |
tree | c95f4e8bcc44e5c9343a630516a41b9f4317e9a0 /clang/lib | |
parent | c8325b4b597aceb07edf5b3caefdebc59a263683 (diff) | |
download | llvm-e40901806fec4ea87e00cd408f8ca31c5868de0c.zip llvm-e40901806fec4ea87e00cd408f8ca31c5868de0c.tar.gz llvm-e40901806fec4ea87e00cd408f8ca31c5868de0c.tar.bz2 |
[OPENMP][NVPTX]Improve emission of the globalized variables for
target/teams/distribute regions.
Target/teams/distribute regions exist for all the time the kernel is
executed. Thus, if the variable is declared in their context and then
escape it, we can allocate global memory statically instead of
allocating it dynamically.
Patch captures all the globalized variables in target/teams/distribute
contexts, merges them into the records, one per each target region.
Those records are then joined into the union, one per compilation unit
(to save the global memory). Those units are organized into
2 x dimensional arrays, where the first dimension is
the number of blocks per SM and the second one is the number of SMs.
Runtime functions manage this global memory space between the executing
teams.
llvm-svn: 345978
Diffstat (limited to 'clang/lib')
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 265 | ||||
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h | 18 | ||||
-rw-r--r-- | clang/lib/Driver/ToolChains/Clang.cpp | 2 | ||||
-rw-r--r-- | clang/lib/Frontend/CompilerInvocation.cpp | 8 |
4 files changed, 285 insertions, 8 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index e93b73b..8daaf4c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -17,6 +17,7 @@ #include "clang/AST/DeclOpenMP.h" #include "clang/AST/StmtOpenMP.h" #include "clang/AST/StmtVisitor.h" +#include "clang/Basic/Cuda.h" #include "llvm/ADT/SmallPtrSet.h" using namespace clang; @@ -100,6 +101,11 @@ enum OpenMPRTLFunctionNVPTX { OMPRTL_NVPTX__kmpc_parallel_level, /// Call to int8_t __kmpc_is_spmd_exec_mode(); OMPRTL_NVPTX__kmpc_is_spmd_exec_mode, + /// Call to void __kmpc_get_team_static_memory(const void *buf, size_t size, + /// int16_t is_shared, const void **res); + OMPRTL_NVPTX__kmpc_get_team_static_memory, + /// Call to void __kmpc_restore_team_static_memory(int16_t is_shared); + OMPRTL_NVPTX__kmpc_restore_team_static_memory, }; /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. @@ -1120,19 +1126,39 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D, CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) : EST(EST), WST(WST) {} void Enter(CodeGenFunction &CGF) override { - auto &RT = static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()); + auto &RT = + static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()); RT.emitNonSPMDEntryHeader(CGF, EST, WST); // Skip target region initialization. RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); } void Exit(CodeGenFunction &CGF) override { - auto &RT = static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()); + auto &RT = + static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()); RT.clearLocThreadIdInsertPt(CGF); RT.emitNonSPMDEntryFooter(CGF, EST); } } Action(EST, WST); CodeGen.setAction(Action); IsInTTDRegion = true; + // Reserve place for the globalized memory. + GlobalizedRecords.emplace_back(); + if (!StaticGlobalized) { + StaticGlobalized = new llvm::GlobalVariable( + CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true, + llvm::GlobalValue::WeakAnyLinkage, nullptr, + "_openmp_static_glob_rd$ptr"); + StaticGlobalized->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); + } + if (!KernelStaticGlobalized) { + KernelStaticGlobalized = new llvm::GlobalVariable( + CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, + llvm::GlobalValue::InternalLinkage, + llvm::ConstantPointerNull::get(CGM.VoidPtrTy), + "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, + llvm::GlobalValue::NotThreadLocal, + CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); + } emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); IsInTTDRegion = false; @@ -1249,6 +1275,24 @@ void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D, } Action(*this, EST, D); CodeGen.setAction(Action); IsInTTDRegion = true; + // Reserve place for the globalized memory. + GlobalizedRecords.emplace_back(); + if (!StaticGlobalized) { + StaticGlobalized = new llvm::GlobalVariable( + CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true, + llvm::GlobalValue::WeakAnyLinkage, nullptr, + "_openmp_static_glob_rd$ptr"); + StaticGlobalized->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); + } + if (!KernelStaticGlobalized) { + KernelStaticGlobalized = new llvm::GlobalVariable( + CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, + llvm::GlobalValue::InternalLinkage, + llvm::ConstantPointerNull::get(CGM.VoidPtrTy), + "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, + llvm::GlobalValue::NotThreadLocal, + CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); + } emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); IsInTTDRegion = false; @@ -1758,6 +1802,24 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode"); break; } + case OMPRTL_NVPTX__kmpc_get_team_static_memory: { + // Build void __kmpc_get_team_static_memory(const void *buf, size_t size, + // int16_t is_shared, const void **res); + llvm::Type *TypeParams[] = {CGM.VoidPtrTy, CGM.SizeTy, CGM.Int16Ty, + CGM.VoidPtrPtrTy}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory"); + break; + } + case OMPRTL_NVPTX__kmpc_restore_team_static_memory: { + // Build void __kmpc_restore_team_static_memory(int16_t is_shared); + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, CGM.Int16Ty, /*isVarArg=*/false); + RTLFn = + CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory"); + break; + } } return RTLFn; } @@ -1995,8 +2057,9 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo(); llvm::Value *GlobalRecCastAddr; llvm::Value *IsTTD = nullptr; - if (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { + if (!IsInTTDRegion && + (WithSPMDCheck || + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) { llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd"); llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); @@ -2056,6 +2119,63 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, GlobalRecCastAddr = Phi; I->getSecond().GlobalRecordAddr = Phi; I->getSecond().IsInSPMDModeFlag = IsSPMD; + } else if (IsInTTDRegion) { + assert(GlobalizedRecords.back().Records.size() < 2 && + "Expected less than 2 globalized records: one for target and one " + "for teams."); + unsigned Offset = 0; + for (const RecordDecl *RD : GlobalizedRecords.back().Records) { + QualType RDTy = CGM.getContext().getRecordType(RD); + unsigned Alignment = + CGM.getContext().getTypeAlignInChars(RDTy).getQuantity(); + unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity(); + Offset = + llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment); + } + unsigned Alignment = + CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); + Offset = llvm::alignTo(Offset, Alignment); + GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord); + ++GlobalizedRecords.back().RegionCounter; + if (GlobalizedRecords.back().Records.size() == 1) { + assert(StaticGlobalized && + "Static pointer must be initialized already."); + Address Buffer = CGF.EmitLoadOfPointer( + Address(StaticGlobalized, CGM.getPointerAlign()), + CGM.getContext() + .getPointerType(CGM.getContext().VoidPtrTy) + .castAs<PointerType>()); + auto *RecSize = new llvm::GlobalVariable( + CGM.getModule(), CGM.SizeTy, /*isConstant=*/true, + llvm::GlobalValue::InternalLinkage, nullptr, + "_openmp_static_kernel$size"); + RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); + llvm::Value *Ld = CGF.EmitLoadOfScalar( + Address(RecSize, CGM.getPointerAlign()), /*Volatile=*/false, + CGM.getContext().getSizeType(), Loc); + llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( + KernelStaticGlobalized, CGM.VoidPtrPtrTy); + llvm::Value *GlobalRecordSizeArg[] = { + Buffer.getPointer(), Ld, + llvm::ConstantInt::getNullValue(CGM.Int16Ty), ResAddr}; + CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_get_team_static_memory), + GlobalRecordSizeArg); + GlobalizedRecords.back().RecSize = RecSize; + } + assert(KernelStaticGlobalized && "Global address must be set already."); + Address FrameAddr = CGF.EmitLoadOfPointer( + Address(KernelStaticGlobalized, CGM.getPointerAlign()), + CGM.getContext() + .getPointerType(CGM.getContext().VoidPtrTy) + .castAs<PointerType>()); + llvm::Value *GlobalRecValue = + Bld.CreateConstInBoundsGEP(FrameAddr, Offset, CharUnits::One()) + .getPointer(); + I->getSecond().GlobalRecordAddr = GlobalRecValue; + I->getSecond().IsInSPMDModeFlag = nullptr; + GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( + GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo()); } else { // TODO: allow the usage of shared memory to be controlled by // the user, for now, default to global. @@ -2112,8 +2232,9 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, AlignmentSource::Decl); } Rec.second.PrivateAddr = VarAddr.getAddress(); - if (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { + if (!IsInTTDRegion && + (WithSPMDCheck || + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) { assert(I->getSecond().IsInSPMDModeFlag && "Expected unknown execution mode or required SPMD check."); if (IsTTD) { @@ -2193,8 +2314,9 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF, Addr); } if (I->getSecond().GlobalRecordAddr) { - if (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { + if (!IsInTTDRegion && + (WithSPMDCheck || + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) { CGBuilderTy &Bld = CGF.Builder; llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); @@ -2207,6 +2329,17 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF, OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr)); CGF.EmitBlock(ExitBB); + } else if (IsInTTDRegion) { + assert(GlobalizedRecords.back().RegionCounter > 0 && + "region counter must be > 0."); + --GlobalizedRecords.back().RegionCounter; + // Emit the restore function only in the target region. + if (GlobalizedRecords.back().RegionCounter == 0) { + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_restore_team_static_memory), + llvm::ConstantInt::getNullValue(CGM.Int16Ty)); + } } else { CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), @@ -4308,3 +4441,119 @@ void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas( } } +/// Get number of SMs and number of blocks per SM. +static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) { + std::pair<unsigned, unsigned> Data; + if (CGM.getLangOpts().OpenMPCUDANumSMs) + Data.first = CGM.getLangOpts().OpenMPCUDANumSMs; + if (CGM.getLangOpts().OpenMPCUDABlocksPerSM) + Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM; + if (Data.first && Data.second) + return Data; + if (CGM.getTarget().hasFeature("ptx")) { + llvm::StringMap<bool> Features; + CGM.getTarget().initFeatureMap(Features, CGM.getDiags(), + CGM.getTarget().getTargetOpts().CPU, + CGM.getTarget().getTargetOpts().Features); + for (const auto &Feature : Features) { + if (Feature.getValue()) { + switch (StringToCudaArch(Feature.getKey())) { + case CudaArch::SM_20: + case CudaArch::SM_21: + case CudaArch::SM_30: + case CudaArch::SM_32: + case CudaArch::SM_35: + case CudaArch::SM_37: + case CudaArch::SM_50: + case CudaArch::SM_52: + case CudaArch::SM_53: + return {16, 16}; + case CudaArch::SM_60: + case CudaArch::SM_61: + case CudaArch::SM_62: + return {56, 32}; + case CudaArch::SM_70: + case CudaArch::SM_72: + case CudaArch::SM_75: + return {84, 32}; + case CudaArch::GFX600: + case CudaArch::GFX601: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX702: + case CudaArch::GFX703: + case CudaArch::GFX704: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX902: + case CudaArch::GFX904: + case CudaArch::GFX906: + case CudaArch::GFX909: + case CudaArch::UNKNOWN: + break; + case CudaArch::LAST: + llvm_unreachable("Unexpected Cuda arch."); + } + } + } + } + llvm_unreachable("Unexpected NVPTX target without ptx feature."); +} + +void CGOpenMPRuntimeNVPTX::clear() { + if (!GlobalizedRecords.empty()) { + ASTContext &C = CGM.getContext(); + RecordDecl *StaticRD = C.buildImplicitRecord( + "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); + StaticRD->startDefinition(); + for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) { + if (Records.Records.empty()) + continue; + unsigned Size = 0; + unsigned RecAlignment = 0; + for (const RecordDecl *RD : Records.Records) { + QualType RDTy = CGM.getContext().getRecordType(RD); + unsigned Alignment = + CGM.getContext().getTypeAlignInChars(RDTy).getQuantity(); + RecAlignment = std::max(RecAlignment, Alignment); + unsigned RecSize = + CGM.getContext().getTypeSizeInChars(RDTy).getQuantity(); + Size = + llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment); + } + Size = llvm::alignTo(Size, RecAlignment); + llvm::APInt ArySize(/*numBits=*/64, Size); + QualType SubTy = C.getConstantArrayType( + C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0); + auto *Field = FieldDecl::Create( + C, StaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy, + C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), + /*BW=*/nullptr, /*Mutable=*/false, + /*InitStyle=*/ICIS_NoInit); + Field->setAccess(AS_public); + StaticRD->addDecl(Field); + Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size)); + } + StaticRD->completeDefinition(); + QualType StaticTy = C.getRecordType(StaticRD); + std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM); + llvm::APInt Size1(32, SMsBlockPerSM.second); + QualType Arr1Ty = C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal, + /*IndexTypeQuals=*/0); + llvm::APInt Size2(32, SMsBlockPerSM.first); + QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal, + /*IndexTypeQuals=*/0); + llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty); + auto *GV = new llvm::GlobalVariable( + CGM.getModule(), LLVMArr2Ty, + /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, + llvm::Constant::getNullValue(LLVMArr2Ty), "_openmp_static_glob_rd_$_"); + StaticGlobalized->setInitializer( + llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, + CGM.VoidPtrTy)); + } + CGOpenMPRuntime::clear(); +} diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index b0b66d8..dabc6b4 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -182,6 +182,7 @@ protected: public: explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM); + void clear() override; /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 /// global_tid, int proc_bind) to generate code for 'proc_bind' clause. @@ -413,6 +414,23 @@ private: /// Maps the function to the list of the globalized variables with their /// addresses. llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls; + /// List of records for the globalized variables in target/teams/distribute + /// contexts. Inner records are going to be joined into the single record, + /// while those resulting records are going to be joined into the single + /// union. This resulting union (one per CU) is the entry point for the static + /// memory management runtime functions. + struct GlobalPtrSizeRecsTy { + llvm::GlobalVariable *RecSize = nullptr; + llvm::SmallVector<const RecordDecl *, 2> Records; + unsigned RegionCounter = 0; + }; + llvm::SmallVector<GlobalPtrSizeRecsTy, 8> GlobalizedRecords; + /// Global variable used for staticlly allocated global memoryused for + /// globalization in target/teams/distribute regions. + llvm::GlobalVariable *StaticGlobalized = nullptr; + /// Shared pointer for the global memory in the global memory buffer used for + /// the given kernel. + llvm::GlobalVariable *KernelStaticGlobalized = nullptr; }; } // CodeGen namespace. diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a8ddd8a..41681e7 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4298,6 +4298,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_fopenmp_simd, options::OPT_fno_openmp_simd); Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_version_EQ); + Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_number_of_sm_EQ); + Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_blocks_per_sm_EQ); // When in OpenMP offloading mode with NVPTX target, forward // cuda-mode flag diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 912afba..d78c133 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -2686,6 +2686,14 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, Opts.Exceptions = 0; Opts.CXXExceptions = 0; } + if (Opts.OpenMPIsDevice && T.isNVPTX()) { + Opts.OpenMPCUDANumSMs = + getLastArgIntValue(Args, options::OPT_fopenmp_cuda_number_of_sm_EQ, + Opts.OpenMPCUDANumSMs, Diags); + Opts.OpenMPCUDABlocksPerSM = + getLastArgIntValue(Args, options::OPT_fopenmp_cuda_blocks_per_sm_EQ, + Opts.OpenMPCUDABlocksPerSM, Diags); + } // Get the OpenMP target triples if any. if (Arg *A = Args.getLastArg(options::OPT_fopenmp_targets_EQ)) { |