//===----- HipStdPar.cpp - HIP C++ Standard Parallelism Support Passes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file implements three passes that enable HIP C++ Standard Parallelism
// Support:
//
// 1. AcceleratorCodeSelection (required): Given that only algorithms are
//    accelerated, and that the accelerated implementation exists in the form of
//    a compute kernel, we assume that only the kernel, and all functions
//    reachable from it, constitute code that the user expects the accelerator
//    to execute. Thus, we identify the set of all functions reachable from
//    kernels, and then remove all unreachable ones. This last part is necessary
//    because it is possible for code that the user did not expect to execute on
//    an accelerator to contain constructs that cannot be handled by the target
//    BE, which cannot be provably demonstrated to be dead code in general, and
//    thus can lead to mis-compilation. The degenerate case of this is when a
//    Module contains no kernels (the parent TU had no algorithm invocations fit
//    for acceleration), which we handle by completely emptying said module.
//    **NOTE**: The above does not handle indirectly reachable functions i.e.
//              it is possible to obtain a case where the target of an indirect
//              call is otherwise unreachable and thus is removed; this
//              restriction is aligned with the current `-hipstdpar` limitations
//              and will be relaxed in the future.
//
// 2. AllocationInterposition (required only when on-demand paging is
//    unsupported): Some accelerators or operating systems might not support
//    transparent on-demand paging. Thus, they would only be able to access
//    memory that is allocated by an accelerator-aware mechanism. For such cases
//    the user can opt into enabling allocation / deallocation interposition,
//    whereby we replace calls to known allocation / deallocation functions with
//    calls to runtime implemented equivalents that forward the requests to
//    accelerator-aware interfaces. We also support freeing system allocated
//    memory that ends up in one of the runtime equivalents, since this can
//    happen if e.g. a library that was compiled without interposition returns
//    an allocation that can be validly passed to `free`.
//
// 3. MathFixup (required): Some accelerators might have an incomplete
//    implementation for the intrinsics used to implement some of the math
//    functions in <cmath> / their corresponding libcall lowerings. Since this
//    can vary quite significantly between accelerators, we replace calls to a
//    set of intrinsics / lib functions known to be problematic with calls to a
//    HIPSTDPAR specific forwarding layer, which gives a uniform interface for
//    accelerators to implement in their own runtime components. This pass
//    should run before AcceleratorCodeSelection so as to prevent the spurious
//    removal of the HIPSTDPAR specific forwarding functions.
//===----------------------------------------------------------------------===// #include "llvm/Transforms/HipStdPar/HipStdPar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include #include #include using namespace llvm; template static inline void eraseFromModule(T &ToErase) { ToErase.replaceAllUsesWith(PoisonValue::get(ToErase.getType())); ToErase.eraseFromParent(); } static bool checkIfSupported(GlobalVariable &G) { if (!G.isThreadLocal()) return true; G.dropDroppableUses(); if (!G.isConstantUsed()) return true; std::string W; raw_string_ostream OS(W); OS << "Accelerator does not support the thread_local variable " << G.getName(); Instruction *I = nullptr; SmallVector Tmp(G.users()); SmallPtrSet Visited; do { auto U = std::move(Tmp.back()); Tmp.pop_back(); if (!Visited.insert(U).second) continue; if (isa(U)) I = cast(U); else Tmp.insert(Tmp.end(), U->user_begin(), U->user_end()); } while (!I && !Tmp.empty()); assert(I && "thread_local global should have at least one non-constant use."); G.getContext().diagnose( DiagnosticInfoUnsupported(*I->getParent()->getParent(), W, I->getDebugLoc(), DS_Error)); return false; } static inline void clearModule(Module &M) { // TODO: simplify. while (!M.functions().empty()) eraseFromModule(*M.begin()); while (!M.globals().empty()) eraseFromModule(*M.globals().begin()); while (!M.aliases().empty()) eraseFromModule(*M.aliases().begin()); while (!M.ifuncs().empty()) eraseFromModule(*M.ifuncs().begin()); } static SmallVector> collectIndirectableUses(GlobalVariable *G) { // We are interested only in use chains that end in an Instruction. 
SmallVector> Uses; SmallVector> Stack(G->use_begin(), G->use_end()); while (!Stack.empty()) { Use &U = Stack.pop_back_val(); if (isa(U.getUser())) Uses.emplace_back(U); else transform(U.getUser()->uses(), std::back_inserter(Stack), [](auto &&U) { return std::ref(U); }); } return Uses; } static inline GlobalVariable *getGlobalForName(GlobalVariable *G) { // Create an anonymous global which stores the variable's name, which will be // used by the HIPSTDPAR runtime to look up the program-wide symbol. LLVMContext &Ctx = G->getContext(); auto *CDS = ConstantDataArray::getString(Ctx, G->getName()); GlobalVariable *N = G->getParent()->getOrInsertGlobal("", CDS->getType()); N->setInitializer(CDS); N->setLinkage(GlobalValue::LinkageTypes::PrivateLinkage); N->setConstant(true); return N; } static inline GlobalVariable *getIndirectionGlobal(Module *M) { // Create an anonymous global which stores a pointer to a pointer, which will // be externally initialised by the HIPSTDPAR runtime with the address of the // program-wide symbol. 
Type *PtrTy = PointerType::get( M->getContext(), M->getDataLayout().getDefaultGlobalsAddressSpace()); GlobalVariable *NewG = M->getOrInsertGlobal("", PtrTy); NewG->setInitializer(PoisonValue::get(NewG->getValueType())); NewG->setLinkage(GlobalValue::LinkageTypes::PrivateLinkage); NewG->setConstant(true); NewG->setExternallyInitialized(true); return NewG; } static Constant * appendIndirectedGlobal(const GlobalVariable *IndirectionTable, SmallVector &SymbolIndirections, GlobalVariable *ToIndirect) { Module *M = ToIndirect->getParent(); auto *InitTy = cast(IndirectionTable->getValueType()); auto *SymbolListTy = cast(InitTy->getStructElementType(2)); Type *NameTy = SymbolListTy->getElementType(0); Type *IndirectTy = SymbolListTy->getElementType(1); Constant *NameG = getGlobalForName(ToIndirect); Constant *IndirectG = getIndirectionGlobal(M); Constant *Entry = ConstantStruct::get( SymbolListTy, {ConstantExpr::getAddrSpaceCast(NameG, NameTy), ConstantExpr::getAddrSpaceCast(IndirectG, IndirectTy)}); SymbolIndirections.push_back(Entry); return IndirectG; } static void fillIndirectionTable(GlobalVariable *IndirectionTable, SmallVector Indirections) { Module *M = IndirectionTable->getParent(); size_t SymCnt = Indirections.size(); auto *InitTy = cast(IndirectionTable->getValueType()); Type *SymbolListTy = InitTy->getStructElementType(1); auto *SymbolTy = cast(InitTy->getStructElementType(2)); Constant *Count = ConstantInt::get(InitTy->getStructElementType(0), SymCnt); M->removeGlobalVariable(IndirectionTable); GlobalVariable *Symbols = M->getOrInsertGlobal("", ArrayType::get(SymbolTy, SymCnt)); Symbols->setLinkage(GlobalValue::LinkageTypes::PrivateLinkage); Symbols->setInitializer( ConstantArray::get(ArrayType::get(SymbolTy, SymCnt), {Indirections})); Symbols->setConstant(true); Constant *ASCSymbols = ConstantExpr::getAddrSpaceCast(Symbols, SymbolListTy); Constant *Init = ConstantStruct::get( InitTy, {Count, ASCSymbols, PoisonValue::get(SymbolTy)}); 
M->insertGlobalVariable(IndirectionTable); IndirectionTable->setInitializer(Init); } static void replaceWithIndirectUse(const Use &U, const GlobalVariable *G, Constant *IndirectedG) { auto *I = cast(U.getUser()); IRBuilder<> Builder(I); unsigned OpIdx = U.getOperandNo(); Value *Op = I->getOperand(OpIdx); // We walk back up the use chain, which could be an arbitrarily long sequence // of constexpr AS casts, ptr-to-int and GEP instructions, until we reach the // indirected global. while (auto *CE = dyn_cast(Op)) { assert((CE->getOpcode() == Instruction::GetElementPtr || CE->getOpcode() == Instruction::AddrSpaceCast || CE->getOpcode() == Instruction::PtrToInt) && "Only GEP, ASCAST or PTRTOINT constant uses supported!"); Instruction *NewI = Builder.Insert(CE->getAsInstruction()); I->replaceUsesOfWith(Op, NewI); I = NewI; Op = I->getOperand(0); OpIdx = 0; Builder.SetInsertPoint(I); } assert(Op == G && "Must reach indirected global!"); I->setOperand(OpIdx, Builder.CreateLoad(G->getType(), IndirectedG)); } static inline bool isValidIndirectionTable(GlobalVariable *IndirectionTable) { std::string W; raw_string_ostream OS(W); Type *Ty = IndirectionTable->getValueType(); bool Valid = false; if (!isa(Ty)) { OS << "The Indirection Table must be a struct type; "; Ty->print(OS); OS << " is incorrect.\n"; } else if (cast(Ty)->getNumElements() != 3u) { OS << "The Indirection Table must have 3 elements; " << cast(Ty)->getNumElements() << " is incorrect.\n"; } else if (!isa(cast(Ty)->getStructElementType(0))) { OS << "The first element in the Indirection Table must be an integer; "; cast(Ty)->getStructElementType(0)->print(OS); OS << " is incorrect.\n"; } else if (!isa(cast(Ty)->getStructElementType(1))) { OS << "The second element in the Indirection Table must be a pointer; "; cast(Ty)->getStructElementType(1)->print(OS); OS << " is incorrect.\n"; } else if (!isa(cast(Ty)->getStructElementType(2))) { OS << "The third element in the Indirection Table must be a struct type; "; 
cast(Ty)->getStructElementType(2)->print(OS); OS << " is incorrect.\n"; } else { Valid = true; } if (!Valid) IndirectionTable->getContext().diagnose(DiagnosticInfoGeneric(W, DS_Error)); return Valid; } static void indirectGlobals(GlobalVariable *IndirectionTable, SmallVector ToIndirect) { // We replace globals with an indirected access via a pointer that will get // set by the HIPSTDPAR runtime, using their accessible, program-wide unique // address as set by the host linker-loader. SmallVector SymbolIndirections; for (auto &&G : ToIndirect) { SmallVector> Uses = collectIndirectableUses(G); if (Uses.empty()) continue; Constant *IndirectedGlobal = appendIndirectedGlobal(IndirectionTable, SymbolIndirections, G); for_each(Uses, [=](auto &&U) { replaceWithIndirectUse(U, G, IndirectedGlobal); }); eraseFromModule(*G); } if (SymbolIndirections.empty()) return; fillIndirectionTable(IndirectionTable, std::move(SymbolIndirections)); } static inline void maybeHandleGlobals(Module &M) { unsigned GlobAS = M.getDataLayout().getDefaultGlobalsAddressSpace(); SmallVector ToIndirect; for (auto &&G : M.globals()) { if (!checkIfSupported(G)) return clearModule(M); if (G.getAddressSpace() != GlobAS) continue; if (G.isConstant() && G.hasInitializer() && G.hasAtLeastLocalUnnamedAddr()) continue; ToIndirect.push_back(&G); } if (ToIndirect.empty()) return; if (auto *IT = M.getNamedGlobal("__hipstdpar_symbol_indirection_table")) { if (!isValidIndirectionTable(IT)) return clearModule(M); return indirectGlobals(IT, std::move(ToIndirect)); } else { for (auto &&G : ToIndirect) { // We will internalise these, so we provide a poison initialiser. 
if (!G->hasInitializer()) G->setInitializer(PoisonValue::get(G->getValueType())); } } } template static inline void removeUnreachableFunctions( const SmallPtrSet& Reachable, Module &M) { removeFromUsedLists(M, [&](Constant *C) { if (auto F = dyn_cast(C)) return !Reachable.contains(F); return false; }); SmallVector> ToRemove; copy_if(M, std::back_inserter(ToRemove), [&](auto &&F) { return !F.isIntrinsic() && !Reachable.contains(&F); }); for_each(ToRemove, eraseFromModule); } static inline bool isAcceleratorExecutionRoot(const Function *F) { if (!F) return false; return F->getCallingConv() == CallingConv::AMDGPU_KERNEL; } static inline bool checkIfSupported(const Function *F, const CallBase *CB) { const auto Dx = F->getName().rfind("__hipstdpar_unsupported"); if (Dx == StringRef::npos) return true; const auto N = F->getName().substr(0, Dx); std::string W; raw_string_ostream OS(W); if (N == "__ASM") OS << "Accelerator does not support the ASM block:\n" << cast(CB->getArgOperand(0))->getAsCString(); else OS << "Accelerator does not support the " << N << " function."; auto Caller = CB->getParent()->getParent(); Caller->getContext().diagnose( DiagnosticInfoUnsupported(*Caller, W, CB->getDebugLoc(), DS_Error)); return false; } PreservedAnalyses HipStdParAcceleratorCodeSelectionPass::run(Module &M, ModuleAnalysisManager &MAM) { auto &CGA = MAM.getResult(M); SmallPtrSet Reachable; for (auto &&CGN : CGA) { if (!isAcceleratorExecutionRoot(CGN.first)) continue; Reachable.insert(CGN.first); SmallVector Tmp({CGN.first}); do { auto F = std::move(Tmp.back()); Tmp.pop_back(); for (auto &&N : *CGA[F]) { if (!N.second) continue; if (!N.second->getFunction()) continue; if (Reachable.contains(N.second->getFunction())) continue; if (!checkIfSupported(N.second->getFunction(), dyn_cast(*N.first))) return PreservedAnalyses::none(); Reachable.insert(N.second->getFunction()); Tmp.push_back(N.second->getFunction()); } } while (!std::empty(Tmp)); } if (std::empty(Reachable)) clearModule(M); 
else removeUnreachableFunctions(Reachable, M); maybeHandleGlobals(M); return PreservedAnalyses::none(); } static constexpr std::pair ReplaceMap[]{ {"aligned_alloc", "__hipstdpar_aligned_alloc"}, {"calloc", "__hipstdpar_calloc"}, {"free", "__hipstdpar_free"}, {"malloc", "__hipstdpar_malloc"}, {"memalign", "__hipstdpar_aligned_alloc"}, {"mmap", "__hipstdpar_mmap"}, {"munmap", "__hipstdpar_munmap"}, {"posix_memalign", "__hipstdpar_posix_aligned_alloc"}, {"realloc", "__hipstdpar_realloc"}, {"reallocarray", "__hipstdpar_realloc_array"}, {"_ZdaPv", "__hipstdpar_operator_delete"}, {"_ZdaPvm", "__hipstdpar_operator_delete_sized"}, {"_ZdaPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"}, {"_ZdaPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"}, {"_ZdlPv", "__hipstdpar_operator_delete"}, {"_ZdlPvm", "__hipstdpar_operator_delete_sized"}, {"_ZdlPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"}, {"_ZdlPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"}, {"_Znam", "__hipstdpar_operator_new"}, {"_ZnamRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"}, {"_ZnamSt11align_val_t", "__hipstdpar_operator_new_aligned"}, {"_ZnamSt11align_val_tRKSt9nothrow_t", "__hipstdpar_operator_new_aligned_nothrow"}, {"_Znwm", "__hipstdpar_operator_new"}, {"_ZnwmRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"}, {"_ZnwmSt11align_val_t", "__hipstdpar_operator_new_aligned"}, {"_ZnwmSt11align_val_tRKSt9nothrow_t", "__hipstdpar_operator_new_aligned_nothrow"}, {"__builtin_calloc", "__hipstdpar_calloc"}, {"__builtin_free", "__hipstdpar_free"}, {"__builtin_malloc", "__hipstdpar_malloc"}, {"__builtin_operator_delete", "__hipstdpar_operator_delete"}, {"__builtin_operator_new", "__hipstdpar_operator_new"}, {"__builtin_realloc", "__hipstdpar_realloc"}, {"__libc_calloc", "__hipstdpar_calloc"}, {"__libc_free", "__hipstdpar_free"}, {"__libc_malloc", "__hipstdpar_malloc"}, {"__libc_memalign", "__hipstdpar_aligned_alloc"}, {"__libc_realloc", "__hipstdpar_realloc"}}; 
static constexpr std::pair HiddenMap[]{ // hidden_malloc and hidden_free are only kept for backwards compatibility / // legacy purposes, and we should remove them in the future {"__hipstdpar_hidden_malloc", "__libc_malloc"}, {"__hipstdpar_hidden_free", "__libc_free"}, {"__hipstdpar_hidden_memalign", "__libc_memalign"}, {"__hipstdpar_hidden_mmap", "mmap"}, {"__hipstdpar_hidden_munmap", "munmap"}}; PreservedAnalyses HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) { SmallDenseMap AllocReplacements(std::cbegin(ReplaceMap), std::cend(ReplaceMap)); for (auto &&F : M) { if (!F.hasName()) continue; auto It = AllocReplacements.find(F.getName()); if (It == AllocReplacements.end()) continue; if (auto R = M.getFunction(It->second)) { F.replaceAllUsesWith(R); } else { std::string W; raw_string_ostream OS(W); OS << "cannot be interposed, missing: " << AllocReplacements[F.getName()] << ". Tried to run the allocation interposition pass without the " << "replacement functions available."; F.getContext().diagnose(DiagnosticInfoUnsupported(F, W, F.getSubprogram(), DS_Warning)); } } for (auto &&HR : HiddenMap) { if (auto F = M.getFunction(HR.first)) { auto R = M.getOrInsertFunction(HR.second, F->getFunctionType(), F->getAttributes()); F->replaceAllUsesWith(R.getCallee()); eraseFromModule(*F); } } return PreservedAnalyses::none(); } static constexpr std::pair MathLibToHipStdPar[]{ {"acosh", "__hipstdpar_acosh_f64"}, {"acoshf", "__hipstdpar_acosh_f32"}, {"asinh", "__hipstdpar_asinh_f64"}, {"asinhf", "__hipstdpar_asinh_f32"}, {"atanh", "__hipstdpar_atanh_f64"}, {"atanhf", "__hipstdpar_atanh_f32"}, {"cbrt", "__hipstdpar_cbrt_f64"}, {"cbrtf", "__hipstdpar_cbrt_f32"}, {"erf", "__hipstdpar_erf_f64"}, {"erff", "__hipstdpar_erf_f32"}, {"erfc", "__hipstdpar_erfc_f64"}, {"erfcf", "__hipstdpar_erfc_f32"}, {"fdim", "__hipstdpar_fdim_f64"}, {"fdimf", "__hipstdpar_fdim_f32"}, {"expm1", "__hipstdpar_expm1_f64"}, {"expm1f", "__hipstdpar_expm1_f32"}, {"hypot", 
"__hipstdpar_hypot_f64"}, {"hypotf", "__hipstdpar_hypot_f32"}, {"ilogb", "__hipstdpar_ilogb_f64"}, {"ilogbf", "__hipstdpar_ilogb_f32"}, {"lgamma", "__hipstdpar_lgamma_f64"}, {"lgammaf", "__hipstdpar_lgamma_f32"}, {"log1p", "__hipstdpar_log1p_f64"}, {"log1pf", "__hipstdpar_log1p_f32"}, {"logb", "__hipstdpar_logb_f64"}, {"logbf", "__hipstdpar_logb_f32"}, {"nextafter", "__hipstdpar_nextafter_f64"}, {"nextafterf", "__hipstdpar_nextafter_f32"}, {"nexttoward", "__hipstdpar_nexttoward_f64"}, {"nexttowardf", "__hipstdpar_nexttoward_f32"}, {"remainder", "__hipstdpar_remainder_f64"}, {"remainderf", "__hipstdpar_remainder_f32"}, {"remquo", "__hipstdpar_remquo_f64"}, {"remquof", "__hipstdpar_remquo_f32"}, {"scalbln", "__hipstdpar_scalbln_f64"}, {"scalblnf", "__hipstdpar_scalbln_f32"}, {"scalbn", "__hipstdpar_scalbn_f64"}, {"scalbnf", "__hipstdpar_scalbn_f32"}, {"tgamma", "__hipstdpar_tgamma_f64"}, {"tgammaf", "__hipstdpar_tgamma_f32"}}; PreservedAnalyses HipStdParMathFixupPass::run(Module &M, ModuleAnalysisManager &) { if (M.empty()) return PreservedAnalyses::all(); SmallVector> ToReplace; for (auto &&F : M) { if (!F.hasName()) continue; StringRef N = F.getName(); Intrinsic::ID ID = F.getIntrinsicID(); switch (ID) { case Intrinsic::not_intrinsic: { auto It = find_if(MathLibToHipStdPar, [&](auto &&M) { return M.first == N; }); if (It == std::cend(MathLibToHipStdPar)) continue; ToReplace.emplace_back(&F, It->second); break; } case Intrinsic::acos: case Intrinsic::asin: case Intrinsic::atan: case Intrinsic::atan2: case Intrinsic::cosh: case Intrinsic::modf: case Intrinsic::sinh: case Intrinsic::tan: case Intrinsic::tanh: break; default: { if (F.getReturnType()->isDoubleTy()) { switch (ID) { case Intrinsic::cos: case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::log: case Intrinsic::log10: case Intrinsic::log2: case Intrinsic::pow: case Intrinsic::sin: break; default: continue; } break; } continue; } } ToReplace.emplace_back(&F, N); llvm::replace(ToReplace.back().second, 
'.', '_'); StringRef Prefix = "llvm"; ToReplace.back().second.replace(0, Prefix.size(), "__hipstdpar"); } for (auto &&[F, NewF] : ToReplace) F->replaceAllUsesWith( M.getOrInsertFunction(NewF, F->getFunctionType()).getCallee()); return PreservedAnalyses::none(); }