From 9fa9d9a7e1cd0a7fd8c35bdfc642793447bf70aa Mon Sep 17 00:00:00 2001 From: Fabian Mora Date: Mon, 15 Jan 2024 16:30:07 -0500 Subject: [llvm][frontend][offloading] Move clang-linker-wrapper/OffloadWrapper.* to llvm/Frontend/Offloading (#78057) This patch moves `clang/tools/clang-linker-wrapper/OffloadWrapper.*` to `llvm/Frontend/Offloading` allowing them to be re-utilized by other projects. Additionally, it makes minor modifications to the API to make it more flexible. Concretely: - The `wrap*` methods now have additional arguments `EntryArray`, `Suffix` and `EmitSurfacesAndTextures` to specify some additional options. - The `EntryArray` is now constructed by the caller. This change is needed to enable JIT compilation, as ORC doesn't fully support `__start_` and `__stop_` symbols. Thus, to JIT the code, the `EntryArray` has to be constructed explicitly in the IR. - The `Suffix` field is used when emitting the descriptor, registration methods, etc, to make them more readable. It is empty by default. - The `EmitSurfacesAndTextures` field controls whether to emit surface and texture registration code, as those functions were removed from `CUDART` in CUDA 12. It is true by default. - The function `getOffloadingEntryInitializer` was added to help create the `EntryArray`, as it returns the constant initializer and not a global variable. 
--- clang/test/Driver/linker-wrapper-image.c | 16 +- clang/tools/clang-linker-wrapper/CMakeLists.txt | 1 - .../clang-linker-wrapper/ClangLinkerWrapper.cpp | 15 +- .../tools/clang-linker-wrapper/OffloadWrapper.cpp | 599 -------------------- clang/tools/clang-linker-wrapper/OffloadWrapper.h | 28 - .../llvm/Frontend/Offloading/OffloadWrapper.h | 52 ++ llvm/include/llvm/Frontend/Offloading/Utility.h | 6 + llvm/lib/Frontend/Offloading/CMakeLists.txt | 2 + llvm/lib/Frontend/Offloading/OffloadWrapper.cpp | 620 +++++++++++++++++++++ llvm/lib/Frontend/Offloading/Utility.cpp | 21 +- 10 files changed, 714 insertions(+), 646 deletions(-) delete mode 100644 clang/tools/clang-linker-wrapper/OffloadWrapper.cpp delete mode 100644 clang/tools/clang-linker-wrapper/OffloadWrapper.h create mode 100644 llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h create mode 100644 llvm/lib/Frontend/Offloading/OffloadWrapper.cpp diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c index 03caa1e..147d315 100644 --- a/clang/test/Driver/linker-wrapper-image.c +++ b/clang/test/Driver/linker-wrapper-image.c @@ -45,10 +45,6 @@ // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=CUDA,CUDA-COFF -// CUDA: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".nv_fatbin" -// CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8 -// CUDA-NEXT: @.cuda.binary_handle = internal global ptr null - // CUDA-ELF: @__start_cuda_offloading_entries = external hidden constant [0 x %struct.__tgt_offload_entry] // CUDA-ELF-NEXT: @__stop_cuda_offloading_entries = external hidden constant [0 x %struct.__tgt_offload_entry] // CUDA-ELF-NEXT: @__dummy.cuda_offloading_entries = hidden constant [0 x 
%struct.__tgt_offload_entry] zeroinitializer, section "cuda_offloading_entries" @@ -56,6 +52,10 @@ // CUDA-COFF: @__start_cuda_offloading_entries = hidden constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "cuda_offloading_entries$OA" // CUDA-COFF-NEXT: @__stop_cuda_offloading_entries = hidden constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "cuda_offloading_entries$OZ" +// CUDA: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".nv_fatbin" +// CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8 +// CUDA-NEXT: @.cuda.binary_handle = internal global ptr null + // CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }] // CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" { @@ -145,10 +145,6 @@ // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=HIP,HIP-COFF -// HIP: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".hip_fatbin" -// HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8 -// HIP-NEXT: @.hip.binary_handle = internal global ptr null - // HIP-ELF: @__start_hip_offloading_entries = external hidden constant [0 x %struct.__tgt_offload_entry] // HIP-ELF-NEXT: @__stop_hip_offloading_entries = external hidden constant [0 x %struct.__tgt_offload_entry] // HIP-ELF-NEXT: @__dummy.hip_offloading_entries = hidden constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "hip_offloading_entries" @@ -156,6 +152,10 @@ // HIP-COFF: @__start_hip_offloading_entries = hidden constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section 
"hip_offloading_entries$OA" // HIP-COFF-NEXT: @__stop_hip_offloading_entries = hidden constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "hip_offloading_entries$OZ" +// HIP: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".hip_fatbin" +// HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8 +// HIP-NEXT: @.hip.binary_handle = internal global ptr null + // HIP: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }] // HIP: define internal void @.hip.fatbin_reg() section ".text.startup" { diff --git a/clang/tools/clang-linker-wrapper/CMakeLists.txt b/clang/tools/clang-linker-wrapper/CMakeLists.txt index 744026a..5556869 100644 --- a/clang/tools/clang-linker-wrapper/CMakeLists.txt +++ b/clang/tools/clang-linker-wrapper/CMakeLists.txt @@ -28,7 +28,6 @@ endif() add_clang_tool(clang-linker-wrapper ClangLinkerWrapper.cpp - OffloadWrapper.cpp DEPENDS ${tablegen_deps} diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 122ba19..82cec17 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -14,11 +14,12 @@ // //===---------------------------------------------------------------------===// -#include "OffloadWrapper.h" #include "clang/Basic/Version.h" #include "llvm/BinaryFormat/Magic.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/CodeGen/CommandFlags.h" +#include "llvm/Frontend/Offloading/OffloadWrapper.h" +#include "llvm/Frontend/Offloading/Utility.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Module.h" @@ -906,15 +907,21 @@ wrapDeviceImages(ArrayRef> Buffers, switch (Kind) { case OFK_OpenMP: - if (Error Err = wrapOpenMPBinaries(M, BuffersToWrap)) + if 
(Error Err = offloading::wrapOpenMPBinaries( + M, BuffersToWrap, + offloading::getOffloadEntryArray(M, "omp_offloading_entries"))) return std::move(Err); break; case OFK_Cuda: - if (Error Err = wrapCudaBinary(M, BuffersToWrap.front())) + if (Error Err = offloading::wrapCudaBinary( + M, BuffersToWrap.front(), + offloading::getOffloadEntryArray(M, "cuda_offloading_entries"))) return std::move(Err); break; case OFK_HIP: - if (Error Err = wrapHIPBinary(M, BuffersToWrap.front())) + if (Error Err = offloading::wrapHIPBinary( + M, BuffersToWrap.front(), + offloading::getOffloadEntryArray(M, "hip_offloading_entries"))) return std::move(Err); break; default: diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp deleted file mode 100644 index 161374a..0000000 --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp +++ /dev/null @@ -1,599 +0,0 @@ -//===- OffloadWrapper.cpp ---------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "OffloadWrapper.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/BinaryFormat/Magic.h" -#include "llvm/Frontend/Offloading/Utility.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/Object/OffloadBinary.h" -#include "llvm/Support/Error.h" -#include "llvm/TargetParser/Triple.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" - -using namespace llvm; - -namespace { -/// Magic number that begins the section containing the CUDA fatbinary. 
-constexpr unsigned CudaFatMagic = 0x466243b1; -constexpr unsigned HIPFatMagic = 0x48495046; - -IntegerType *getSizeTTy(Module &M) { - return M.getDataLayout().getIntPtrType(M.getContext()); -} - -// struct __tgt_device_image { -// void *ImageStart; -// void *ImageEnd; -// __tgt_offload_entry *EntriesBegin; -// __tgt_offload_entry *EntriesEnd; -// }; -StructType *getDeviceImageTy(Module &M) { - LLVMContext &C = M.getContext(); - StructType *ImageTy = StructType::getTypeByName(C, "__tgt_device_image"); - if (!ImageTy) - ImageTy = - StructType::create("__tgt_device_image", PointerType::getUnqual(C), - PointerType::getUnqual(C), PointerType::getUnqual(C), - PointerType::getUnqual(C)); - return ImageTy; -} - -PointerType *getDeviceImagePtrTy(Module &M) { - return PointerType::getUnqual(getDeviceImageTy(M)); -} - -// struct __tgt_bin_desc { -// int32_t NumDeviceImages; -// __tgt_device_image *DeviceImages; -// __tgt_offload_entry *HostEntriesBegin; -// __tgt_offload_entry *HostEntriesEnd; -// }; -StructType *getBinDescTy(Module &M) { - LLVMContext &C = M.getContext(); - StructType *DescTy = StructType::getTypeByName(C, "__tgt_bin_desc"); - if (!DescTy) - DescTy = StructType::create( - "__tgt_bin_desc", Type::getInt32Ty(C), getDeviceImagePtrTy(M), - PointerType::getUnqual(C), PointerType::getUnqual(C)); - return DescTy; -} - -PointerType *getBinDescPtrTy(Module &M) { - return PointerType::getUnqual(getBinDescTy(M)); -} - -/// Creates binary descriptor for the given device images. Binary descriptor -/// is an object that is passed to the offloading runtime at program startup -/// and it describes all device images available in the executable or shared -/// library. It is defined as follows -/// -/// __attribute__((visibility("hidden"))) -/// extern __tgt_offload_entry *__start_omp_offloading_entries; -/// __attribute__((visibility("hidden"))) -/// extern __tgt_offload_entry *__stop_omp_offloading_entries; -/// -/// static const char Image0[] = { }; -/// ... 
-/// static const char ImageN[] = { }; -/// -/// static const __tgt_device_image Images[] = { -/// { -/// Image0, /*ImageStart*/ -/// Image0 + sizeof(Image0), /*ImageEnd*/ -/// __start_omp_offloading_entries, /*EntriesBegin*/ -/// __stop_omp_offloading_entries /*EntriesEnd*/ -/// }, -/// ... -/// { -/// ImageN, /*ImageStart*/ -/// ImageN + sizeof(ImageN), /*ImageEnd*/ -/// __start_omp_offloading_entries, /*EntriesBegin*/ -/// __stop_omp_offloading_entries /*EntriesEnd*/ -/// } -/// }; -/// -/// static const __tgt_bin_desc BinDesc = { -/// sizeof(Images) / sizeof(Images[0]), /*NumDeviceImages*/ -/// Images, /*DeviceImages*/ -/// __start_omp_offloading_entries, /*HostEntriesBegin*/ -/// __stop_omp_offloading_entries /*HostEntriesEnd*/ -/// }; -/// -/// Global variable that represents BinDesc is returned. -GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs) { - LLVMContext &C = M.getContext(); - auto [EntriesB, EntriesE] = - offloading::getOffloadEntryArray(M, "omp_offloading_entries"); - - auto *Zero = ConstantInt::get(getSizeTTy(M), 0u); - Constant *ZeroZero[] = {Zero, Zero}; - - // Create initializer for the images array. - SmallVector ImagesInits; - ImagesInits.reserve(Bufs.size()); - for (ArrayRef Buf : Bufs) { - // We embed the full offloading entry so the binary utilities can parse it. - auto *Data = ConstantDataArray::get(C, Buf); - auto *Image = new GlobalVariable(M, Data->getType(), /*isConstant=*/true, - GlobalVariable::InternalLinkage, Data, - ".omp_offloading.device_image"); - Image->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - Image->setSection(".llvm.offloading"); - Image->setAlignment(Align(object::OffloadBinary::getAlignment())); - - StringRef Binary(Buf.data(), Buf.size()); - assert(identify_magic(Binary) == file_magic::offload_binary && - "Invalid binary format"); - - // The device image struct contains the pointer to the beginning and end of - // the image stored inside of the offload binary. 
There should only be one - // of these for each buffer so we parse it out manually. - const auto *Header = - reinterpret_cast( - Binary.bytes_begin()); - const auto *Entry = reinterpret_cast( - Binary.bytes_begin() + Header->EntryOffset); - - auto *Begin = ConstantInt::get(getSizeTTy(M), Entry->ImageOffset); - auto *Size = - ConstantInt::get(getSizeTTy(M), Entry->ImageOffset + Entry->ImageSize); - Constant *ZeroBegin[] = {Zero, Begin}; - Constant *ZeroSize[] = {Zero, Size}; - - auto *ImageB = - ConstantExpr::getGetElementPtr(Image->getValueType(), Image, ZeroBegin); - auto *ImageE = - ConstantExpr::getGetElementPtr(Image->getValueType(), Image, ZeroSize); - - ImagesInits.push_back(ConstantStruct::get(getDeviceImageTy(M), ImageB, - ImageE, EntriesB, EntriesE)); - } - - // Then create images array. - auto *ImagesData = ConstantArray::get( - ArrayType::get(getDeviceImageTy(M), ImagesInits.size()), ImagesInits); - - auto *Images = - new GlobalVariable(M, ImagesData->getType(), /*isConstant*/ true, - GlobalValue::InternalLinkage, ImagesData, - ".omp_offloading.device_images"); - Images->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - - auto *ImagesB = - ConstantExpr::getGetElementPtr(Images->getValueType(), Images, ZeroZero); - - // And finally create the binary descriptor object. - auto *DescInit = ConstantStruct::get( - getBinDescTy(M), - ConstantInt::get(Type::getInt32Ty(C), ImagesInits.size()), ImagesB, - EntriesB, EntriesE); - - return new GlobalVariable(M, DescInit->getType(), /*isConstant*/ true, - GlobalValue::InternalLinkage, DescInit, - ".omp_offloading.descriptor"); -} - -void createRegisterFunction(Module &M, GlobalVariable *BinDesc) { - LLVMContext &C = M.getContext(); - auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, - ".omp_offloading.descriptor_reg", &M); - Func->setSection(".text.startup"); - - // Get __tgt_register_lib function declaration. 
- auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), - /*isVarArg*/ false); - FunctionCallee RegFuncC = - M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); - - // Construct function body - IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); - Builder.CreateCall(RegFuncC, BinDesc); - Builder.CreateRetVoid(); - - // Add this function to constructors. - // Set priority to 1 so that __tgt_register_lib is executed AFTER - // __tgt_register_requires (we want to know what requirements have been - // asked for before we load a libomptarget plugin so that by the time the - // plugin is loaded it can report how many devices there are which can - // satisfy these requirements). - appendToGlobalCtors(M, Func, /*Priority*/ 1); -} - -void createUnregisterFunction(Module &M, GlobalVariable *BinDesc) { - LLVMContext &C = M.getContext(); - auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, - ".omp_offloading.descriptor_unreg", &M); - Func->setSection(".text.startup"); - - // Get __tgt_unregister_lib function declaration. - auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), - /*isVarArg*/ false); - FunctionCallee UnRegFuncC = - M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); - - // Construct function body - IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); - Builder.CreateCall(UnRegFuncC, BinDesc); - Builder.CreateRetVoid(); - - // Add this function to global destructors. 
- // Match priority of __tgt_register_lib - appendToGlobalDtors(M, Func, /*Priority*/ 1); -} - -// struct fatbin_wrapper { -// int32_t magic; -// int32_t version; -// void *image; -// void *reserved; -//}; -StructType *getFatbinWrapperTy(Module &M) { - LLVMContext &C = M.getContext(); - StructType *FatbinTy = StructType::getTypeByName(C, "fatbin_wrapper"); - if (!FatbinTy) - FatbinTy = StructType::create( - "fatbin_wrapper", Type::getInt32Ty(C), Type::getInt32Ty(C), - PointerType::getUnqual(C), PointerType::getUnqual(C)); - return FatbinTy; -} - -/// Embed the image \p Image into the module \p M so it can be found by the -/// runtime. -GlobalVariable *createFatbinDesc(Module &M, ArrayRef Image, bool IsHIP) { - LLVMContext &C = M.getContext(); - llvm::Type *Int8PtrTy = PointerType::getUnqual(C); - llvm::Triple Triple = llvm::Triple(M.getTargetTriple()); - - // Create the global string containing the fatbinary. - StringRef FatbinConstantSection = - IsHIP ? ".hip_fatbin" - : (Triple.isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"); - auto *Data = ConstantDataArray::get(C, Image); - auto *Fatbin = new GlobalVariable(M, Data->getType(), /*isConstant*/ true, - GlobalVariable::InternalLinkage, Data, - ".fatbin_image"); - Fatbin->setSection(FatbinConstantSection); - - // Create the fatbinary wrapper - StringRef FatbinWrapperSection = IsHIP ? ".hipFatBinSegment" - : Triple.isMacOSX() ? "__NV_CUDA,__fatbin" - : ".nvFatBinSegment"; - Constant *FatbinWrapper[] = { - ConstantInt::get(Type::getInt32Ty(C), IsHIP ? 
HIPFatMagic : CudaFatMagic), - ConstantInt::get(Type::getInt32Ty(C), 1), - ConstantExpr::getPointerBitCastOrAddrSpaceCast(Fatbin, Int8PtrTy), - ConstantPointerNull::get(PointerType::getUnqual(C))}; - - Constant *FatbinInitializer = - ConstantStruct::get(getFatbinWrapperTy(M), FatbinWrapper); - - auto *FatbinDesc = - new GlobalVariable(M, getFatbinWrapperTy(M), - /*isConstant*/ true, GlobalValue::InternalLinkage, - FatbinInitializer, ".fatbin_wrapper"); - FatbinDesc->setSection(FatbinWrapperSection); - FatbinDesc->setAlignment(Align(8)); - - return FatbinDesc; -} - -/// Create the register globals function. We will iterate all of the offloading -/// entries stored at the begin / end symbols and register them according to -/// their type. This creates the following function in IR: -/// -/// extern struct __tgt_offload_entry __start_cuda_offloading_entries; -/// extern struct __tgt_offload_entry __stop_cuda_offloading_entries; -/// -/// extern void __cudaRegisterFunction(void **, void *, void *, void *, int, -/// void *, void *, void *, void *, int *); -/// extern void __cudaRegisterVar(void **, void *, void *, void *, int32_t, -/// int64_t, int32_t, int32_t); -/// -/// void __cudaRegisterTest(void **fatbinHandle) { -/// for (struct __tgt_offload_entry *entry = &__start_cuda_offloading_entries; -/// entry != &__stop_cuda_offloading_entries; ++entry) { -/// if (!entry->size) -/// __cudaRegisterFunction(fatbinHandle, entry->addr, entry->name, -/// entry->name, -1, 0, 0, 0, 0, 0); -/// else -/// __cudaRegisterVar(fatbinHandle, entry->addr, entry->name, entry->name, -/// 0, entry->size, 0, 0); -/// } -/// } -Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) { - LLVMContext &C = M.getContext(); - auto [EntriesB, EntriesE] = offloading::getOffloadEntryArray( - M, IsHIP ? "hip_offloading_entries" : "cuda_offloading_entries"); - - // Get the __cudaRegisterFunction function declaration. 
- PointerType *Int8PtrTy = PointerType::get(C, 0); - PointerType *Int8PtrPtrTy = PointerType::get(C, 0); - PointerType *Int32PtrTy = PointerType::get(C, 0); - auto *RegFuncTy = FunctionType::get( - Type::getInt32Ty(C), - {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Type::getInt32Ty(C), - Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Int32PtrTy}, - /*isVarArg*/ false); - FunctionCallee RegFunc = M.getOrInsertFunction( - IsHIP ? "__hipRegisterFunction" : "__cudaRegisterFunction", RegFuncTy); - - // Get the __cudaRegisterVar function declaration. - auto *RegVarTy = FunctionType::get( - Type::getVoidTy(C), - {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Type::getInt32Ty(C), - getSizeTTy(M), Type::getInt32Ty(C), Type::getInt32Ty(C)}, - /*isVarArg*/ false); - FunctionCallee RegVar = M.getOrInsertFunction( - IsHIP ? "__hipRegisterVar" : "__cudaRegisterVar", RegVarTy); - - // Get the __cudaRegisterSurface function declaration. - auto *RegSurfaceTy = - FunctionType::get(Type::getVoidTy(C), - {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, - Type::getInt32Ty(C), Type::getInt32Ty(C)}, - /*isVarArg=*/false); - FunctionCallee RegSurface = M.getOrInsertFunction( - IsHIP ? "__hipRegisterSurface" : "__cudaRegisterSurface", RegSurfaceTy); - - // Get the __cudaRegisterTexture function declaration. - auto *RegTextureTy = FunctionType::get( - Type::getVoidTy(C), - {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Type::getInt32Ty(C), - Type::getInt32Ty(C), Type::getInt32Ty(C)}, - /*isVarArg=*/false); - FunctionCallee RegTexture = M.getOrInsertFunction( - IsHIP ? "__hipRegisterTexture" : "__cudaRegisterTexture", RegTextureTy); - - auto *RegGlobalsTy = FunctionType::get(Type::getVoidTy(C), Int8PtrPtrTy, - /*isVarArg*/ false); - auto *RegGlobalsFn = - Function::Create(RegGlobalsTy, GlobalValue::InternalLinkage, - IsHIP ? ".hip.globals_reg" : ".cuda.globals_reg", &M); - RegGlobalsFn->setSection(".text.startup"); - - // Create the loop to register all the entries. 
- IRBuilder<> Builder(BasicBlock::Create(C, "entry", RegGlobalsFn)); - auto *EntryBB = BasicBlock::Create(C, "while.entry", RegGlobalsFn); - auto *IfThenBB = BasicBlock::Create(C, "if.then", RegGlobalsFn); - auto *IfElseBB = BasicBlock::Create(C, "if.else", RegGlobalsFn); - auto *SwGlobalBB = BasicBlock::Create(C, "sw.global", RegGlobalsFn); - auto *SwManagedBB = BasicBlock::Create(C, "sw.managed", RegGlobalsFn); - auto *SwSurfaceBB = BasicBlock::Create(C, "sw.surface", RegGlobalsFn); - auto *SwTextureBB = BasicBlock::Create(C, "sw.texture", RegGlobalsFn); - auto *IfEndBB = BasicBlock::Create(C, "if.end", RegGlobalsFn); - auto *ExitBB = BasicBlock::Create(C, "while.end", RegGlobalsFn); - - auto *EntryCmp = Builder.CreateICmpNE(EntriesB, EntriesE); - Builder.CreateCondBr(EntryCmp, EntryBB, ExitBB); - Builder.SetInsertPoint(EntryBB); - auto *Entry = Builder.CreatePHI(PointerType::getUnqual(C), 2, "entry"); - auto *AddrPtr = - Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry, - {ConstantInt::get(getSizeTTy(M), 0), - ConstantInt::get(Type::getInt32Ty(C), 0)}); - auto *Addr = Builder.CreateLoad(Int8PtrTy, AddrPtr, "addr"); - auto *NamePtr = - Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry, - {ConstantInt::get(getSizeTTy(M), 0), - ConstantInt::get(Type::getInt32Ty(C), 1)}); - auto *Name = Builder.CreateLoad(Int8PtrTy, NamePtr, "name"); - auto *SizePtr = - Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry, - {ConstantInt::get(getSizeTTy(M), 0), - ConstantInt::get(Type::getInt32Ty(C), 2)}); - auto *Size = Builder.CreateLoad(getSizeTTy(M), SizePtr, "size"); - auto *FlagsPtr = - Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry, - {ConstantInt::get(getSizeTTy(M), 0), - ConstantInt::get(Type::getInt32Ty(C), 3)}); - auto *Flags = Builder.CreateLoad(Type::getInt32Ty(C), FlagsPtr, "flags"); - auto *DataPtr = - Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry, - {ConstantInt::get(getSizeTTy(M), 0), - 
ConstantInt::get(Type::getInt32Ty(C), 4)}); - auto *Data = Builder.CreateLoad(Type::getInt32Ty(C), DataPtr, "textype"); - auto *Kind = Builder.CreateAnd( - Flags, ConstantInt::get(Type::getInt32Ty(C), 0x7), "type"); - - // Extract the flags stored in the bit-field and convert them to C booleans. - auto *ExternBit = Builder.CreateAnd( - Flags, ConstantInt::get(Type::getInt32Ty(C), - llvm::offloading::OffloadGlobalExtern)); - auto *Extern = Builder.CreateLShr( - ExternBit, ConstantInt::get(Type::getInt32Ty(C), 3), "extern"); - auto *ConstantBit = Builder.CreateAnd( - Flags, ConstantInt::get(Type::getInt32Ty(C), - llvm::offloading::OffloadGlobalConstant)); - auto *Const = Builder.CreateLShr( - ConstantBit, ConstantInt::get(Type::getInt32Ty(C), 4), "constant"); - auto *NormalizedBit = Builder.CreateAnd( - Flags, ConstantInt::get(Type::getInt32Ty(C), - llvm::offloading::OffloadGlobalNormalized)); - auto *Normalized = Builder.CreateLShr( - NormalizedBit, ConstantInt::get(Type::getInt32Ty(C), 5), "normalized"); - auto *FnCond = - Builder.CreateICmpEQ(Size, ConstantInt::getNullValue(getSizeTTy(M))); - Builder.CreateCondBr(FnCond, IfThenBB, IfElseBB); - - // Create kernel registration code. - Builder.SetInsertPoint(IfThenBB); - Builder.CreateCall(RegFunc, {RegGlobalsFn->arg_begin(), Addr, Name, Name, - ConstantInt::get(Type::getInt32Ty(C), -1), - ConstantPointerNull::get(Int8PtrTy), - ConstantPointerNull::get(Int8PtrTy), - ConstantPointerNull::get(Int8PtrTy), - ConstantPointerNull::get(Int8PtrTy), - ConstantPointerNull::get(Int32PtrTy)}); - Builder.CreateBr(IfEndBB); - Builder.SetInsertPoint(IfElseBB); - - auto *Switch = Builder.CreateSwitch(Kind, IfEndBB); - // Create global variable registration code. 
- Builder.SetInsertPoint(SwGlobalBB); - Builder.CreateCall(RegVar, - {RegGlobalsFn->arg_begin(), Addr, Name, Name, Extern, Size, - Const, ConstantInt::get(Type::getInt32Ty(C), 0)}); - Builder.CreateBr(IfEndBB); - Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalEntry), - SwGlobalBB); - - // Create managed variable registration code. - Builder.SetInsertPoint(SwManagedBB); - Builder.CreateBr(IfEndBB); - Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalManagedEntry), - SwManagedBB); - - // Create surface variable registration code. - Builder.SetInsertPoint(SwSurfaceBB); - Builder.CreateCall( - RegSurface, {RegGlobalsFn->arg_begin(), Addr, Name, Name, Data, Extern}); - Builder.CreateBr(IfEndBB); - Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalSurfaceEntry), - SwSurfaceBB); - - // Create texture variable registration code. - Builder.SetInsertPoint(SwTextureBB); - Builder.CreateCall(RegTexture, {RegGlobalsFn->arg_begin(), Addr, Name, Name, - Data, Normalized, Extern}); - Builder.CreateBr(IfEndBB); - Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalTextureEntry), - SwTextureBB); - - Builder.SetInsertPoint(IfEndBB); - auto *NewEntry = Builder.CreateInBoundsGEP( - offloading::getEntryTy(M), Entry, ConstantInt::get(getSizeTTy(M), 1)); - auto *Cmp = Builder.CreateICmpEQ( - NewEntry, - ConstantExpr::getInBoundsGetElementPtr( - ArrayType::get(offloading::getEntryTy(M), 0), EntriesE, - ArrayRef({ConstantInt::get(getSizeTTy(M), 0), - ConstantInt::get(getSizeTTy(M), 0)}))); - Entry->addIncoming( - ConstantExpr::getInBoundsGetElementPtr( - ArrayType::get(offloading::getEntryTy(M), 0), EntriesB, - ArrayRef({ConstantInt::get(getSizeTTy(M), 0), - ConstantInt::get(getSizeTTy(M), 0)})), - &RegGlobalsFn->getEntryBlock()); - Entry->addIncoming(NewEntry, IfEndBB); - Builder.CreateCondBr(Cmp, ExitBB, EntryBB); - Builder.SetInsertPoint(ExitBB); - Builder.CreateRetVoid(); - - return RegGlobalsFn; -} - -// Create the constructor 
and destructor to register the fatbinary with the CUDA -// runtime. -void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc, - bool IsHIP) { - LLVMContext &C = M.getContext(); - auto *CtorFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *CtorFunc = - Function::Create(CtorFuncTy, GlobalValue::InternalLinkage, - IsHIP ? ".hip.fatbin_reg" : ".cuda.fatbin_reg", &M); - CtorFunc->setSection(".text.startup"); - - auto *DtorFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *DtorFunc = - Function::Create(DtorFuncTy, GlobalValue::InternalLinkage, - IsHIP ? ".hip.fatbin_unreg" : ".cuda.fatbin_unreg", &M); - DtorFunc->setSection(".text.startup"); - - auto *PtrTy = PointerType::getUnqual(C); - - // Get the __cudaRegisterFatBinary function declaration. - auto *RegFatTy = FunctionType::get(PtrTy, PtrTy, /*isVarArg=*/false); - FunctionCallee RegFatbin = M.getOrInsertFunction( - IsHIP ? "__hipRegisterFatBinary" : "__cudaRegisterFatBinary", RegFatTy); - // Get the __cudaRegisterFatBinaryEnd function declaration. - auto *RegFatEndTy = - FunctionType::get(Type::getVoidTy(C), PtrTy, /*isVarArg=*/false); - FunctionCallee RegFatbinEnd = - M.getOrInsertFunction("__cudaRegisterFatBinaryEnd", RegFatEndTy); - // Get the __cudaUnregisterFatBinary function declaration. - auto *UnregFatTy = - FunctionType::get(Type::getVoidTy(C), PtrTy, /*isVarArg=*/false); - FunctionCallee UnregFatbin = M.getOrInsertFunction( - IsHIP ? "__hipUnregisterFatBinary" : "__cudaUnregisterFatBinary", - UnregFatTy); - - auto *AtExitTy = - FunctionType::get(Type::getInt32Ty(C), PtrTy, /*isVarArg=*/false); - FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy); - - auto *BinaryHandleGlobal = new llvm::GlobalVariable( - M, PtrTy, false, llvm::GlobalValue::InternalLinkage, - llvm::ConstantPointerNull::get(PtrTy), - IsHIP ? ".hip.binary_handle" : ".cuda.binary_handle"); - - // Create the constructor to register this image with the runtime. 
- IRBuilder<> CtorBuilder(BasicBlock::Create(C, "entry", CtorFunc)); - CallInst *Handle = CtorBuilder.CreateCall( - RegFatbin, - ConstantExpr::getPointerBitCastOrAddrSpaceCast(FatbinDesc, PtrTy)); - CtorBuilder.CreateAlignedStore( - Handle, BinaryHandleGlobal, - Align(M.getDataLayout().getPointerTypeSize(PtrTy))); - CtorBuilder.CreateCall(createRegisterGlobalsFunction(M, IsHIP), Handle); - if (!IsHIP) - CtorBuilder.CreateCall(RegFatbinEnd, Handle); - CtorBuilder.CreateCall(AtExit, DtorFunc); - CtorBuilder.CreateRetVoid(); - - // Create the destructor to unregister the image with the runtime. We cannot - // use a standard global destructor after CUDA 9.2 so this must be called by - // `atexit()` intead. - IRBuilder<> DtorBuilder(BasicBlock::Create(C, "entry", DtorFunc)); - LoadInst *BinaryHandle = DtorBuilder.CreateAlignedLoad( - PtrTy, BinaryHandleGlobal, - Align(M.getDataLayout().getPointerTypeSize(PtrTy))); - DtorBuilder.CreateCall(UnregFatbin, BinaryHandle); - DtorBuilder.CreateRetVoid(); - - // Add this function to constructors. 
- appendToGlobalCtors(M, CtorFunc, /*Priority*/ 1); -} - -} // namespace - -Error wrapOpenMPBinaries(Module &M, ArrayRef> Images) { - GlobalVariable *Desc = createBinDesc(M, Images); - if (!Desc) - return createStringError(inconvertibleErrorCode(), - "No binary descriptors created."); - createRegisterFunction(M, Desc); - createUnregisterFunction(M, Desc); - return Error::success(); -} - -Error wrapCudaBinary(Module &M, ArrayRef Image) { - GlobalVariable *Desc = createFatbinDesc(M, Image, /* IsHIP */ false); - if (!Desc) - return createStringError(inconvertibleErrorCode(), - "No fatinbary section created."); - - createRegisterFatbinFunction(M, Desc, /* IsHIP */ false); - return Error::success(); -} - -Error wrapHIPBinary(Module &M, ArrayRef Image) { - GlobalVariable *Desc = createFatbinDesc(M, Image, /* IsHIP */ true); - if (!Desc) - return createStringError(inconvertibleErrorCode(), - "No fatinbary section created."); - - createRegisterFatbinFunction(M, Desc, /* IsHIP */ true); - return Error::success(); -} diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.h b/clang/tools/clang-linker-wrapper/OffloadWrapper.h deleted file mode 100644 index 6793339..0000000 --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.h +++ /dev/null @@ -1,28 +0,0 @@ -//===- OffloadWrapper.h --r-------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CLANG_TOOLS_CLANG_LINKER_WRAPPER_OFFLOAD_WRAPPER_H -#define LLVM_CLANG_TOOLS_CLANG_LINKER_WRAPPER_OFFLOAD_WRAPPER_H - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/IR/Module.h" - -/// Wraps the input device images into the module \p M as global symbols and -/// registers the images with the OpenMP Offloading runtime libomptarget. 
-llvm::Error wrapOpenMPBinaries(llvm::Module &M, - llvm::ArrayRef> Images); - -/// Wraps the input fatbinary image into the module \p M as global symbols and -/// registers the images with the CUDA runtime. -llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef Images); - -/// Wraps the input bundled image into the module \p M as global symbols and -/// registers the images with the HIP runtime. -llvm::Error wrapHIPBinary(llvm::Module &M, llvm::ArrayRef Images); - -#endif diff --git a/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h b/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h new file mode 100644 index 0000000..e3ded00 --- /dev/null +++ b/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h @@ -0,0 +1,52 @@ +//===- OffloadWrapper.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_FRONTEND_OFFLOADING_OFFLOADWRAPPER_H +#define LLVM_FRONTEND_OFFLOADING_OFFLOADWRAPPER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/IR/Module.h" + +namespace llvm { +namespace offloading { +using EntryArrayTy = std::pair; +/// Wraps the input device images into the module \p M as global symbols and +/// registers the images with the OpenMP Offloading runtime libomptarget. +/// \param EntryArray Optional pair pointing to the `__start` and `__stop` +/// symbols holding the `__tgt_offload_entry` array. +/// \param Suffix An optional suffix appended to the emitted symbols. +llvm::Error wrapOpenMPBinaries(llvm::Module &M, + llvm::ArrayRef> Images, + EntryArrayTy EntryArray, + llvm::StringRef Suffix = ""); + +/// Wraps the input fatbinary image into the module \p M as global symbols and +/// registers the images with the CUDA runtime. 
+/// \param EntryArray Optional pair pointing to the `__start` and `__stop` +/// symbols holding the `__tgt_offload_entry` array. +/// \param Suffix An optional suffix appended to the emitted symbols. +/// \param EmitSurfacesAndTextures Whether to emit surface and texture +/// registration code. It defaults to true. +llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef Images, + EntryArrayTy EntryArray, llvm::StringRef Suffix = "", + bool EmitSurfacesAndTextures = true); + +/// Wraps the input bundled image into the module \p M as global symbols and +/// registers the images with the HIP runtime. +/// \param EntryArray Optional pair pointing to the `__start` and `__stop` +/// symbols holding the `__tgt_offload_entry` array. +/// \param Suffix An optional suffix appended to the emitted symbols. +/// \param EmitSurfacesAndTextures Whether to emit surface and texture +/// registration code. It defaults to true. +llvm::Error wrapHIPBinary(llvm::Module &M, llvm::ArrayRef Images, + EntryArrayTy EntryArray, llvm::StringRef Suffix = "", + bool EmitSurfacesAndTextures = true); +} // namespace offloading +} // namespace llvm + +#endif // LLVM_FRONTEND_OFFLOADING_OFFLOADWRAPPER_H diff --git a/llvm/include/llvm/Frontend/Offloading/Utility.h b/llvm/include/llvm/Frontend/Offloading/Utility.h index 520c192..f54dd7b 100644 --- a/llvm/include/llvm/Frontend/Offloading/Utility.h +++ b/llvm/include/llvm/Frontend/Offloading/Utility.h @@ -61,6 +61,12 @@ StructType *getEntryTy(Module &M); void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName); +/// Create a constant struct initializer used to register this global at +/// runtime. +/// \return the constant struct and the global variable holding the symbol name. 
+std::pair +getOffloadingEntryInitializer(Module &M, Constant *Addr, StringRef Name, + uint64_t Size, int32_t Flags, int32_t Data); /// Creates a pair of globals used to iterate the array of offloading entries by /// accessing the section variables provided by the linker. diff --git a/llvm/lib/Frontend/Offloading/CMakeLists.txt b/llvm/lib/Frontend/Offloading/CMakeLists.txt index 2d0117c..16e0dcf 100644 --- a/llvm/lib/Frontend/Offloading/CMakeLists.txt +++ b/llvm/lib/Frontend/Offloading/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_component_library(LLVMFrontendOffloading Utility.cpp + OffloadWrapper.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Frontend @@ -9,6 +10,7 @@ add_llvm_component_library(LLVMFrontendOffloading LINK_COMPONENTS Core + BinaryFormat Support TransformUtils TargetParser diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp new file mode 100644 index 0000000..76a8eeb --- /dev/null +++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp @@ -0,0 +1,620 @@ +//===- OffloadWrapper.cpp ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Frontend/Offloading/OffloadWrapper.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/BinaryFormat/Magic.h" +#include "llvm/Frontend/Offloading/Utility.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Object/OffloadBinary.h" +#include "llvm/Support/Error.h" +#include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; +using namespace llvm::offloading; + +namespace { +/// Magic number that begins the section containing the CUDA fatbinary. +constexpr unsigned CudaFatMagic = 0x466243b1; +constexpr unsigned HIPFatMagic = 0x48495046; + +IntegerType *getSizeTTy(Module &M) { + return M.getDataLayout().getIntPtrType(M.getContext()); +} + +// struct __tgt_device_image { +// void *ImageStart; +// void *ImageEnd; +// __tgt_offload_entry *EntriesBegin; +// __tgt_offload_entry *EntriesEnd; +// }; +StructType *getDeviceImageTy(Module &M) { + LLVMContext &C = M.getContext(); + StructType *ImageTy = StructType::getTypeByName(C, "__tgt_device_image"); + if (!ImageTy) + ImageTy = + StructType::create("__tgt_device_image", PointerType::getUnqual(C), + PointerType::getUnqual(C), PointerType::getUnqual(C), + PointerType::getUnqual(C)); + return ImageTy; +} + +PointerType *getDeviceImagePtrTy(Module &M) { + return PointerType::getUnqual(getDeviceImageTy(M)); +} + +// struct __tgt_bin_desc { +// int32_t NumDeviceImages; +// __tgt_device_image *DeviceImages; +// __tgt_offload_entry *HostEntriesBegin; +// __tgt_offload_entry *HostEntriesEnd; +// }; +StructType *getBinDescTy(Module &M) { + LLVMContext &C = M.getContext(); + StructType *DescTy = StructType::getTypeByName(C, "__tgt_bin_desc"); + if (!DescTy) + DescTy = StructType::create( + 
"__tgt_bin_desc", Type::getInt32Ty(C), getDeviceImagePtrTy(M), + PointerType::getUnqual(C), PointerType::getUnqual(C)); + return DescTy; +} + +PointerType *getBinDescPtrTy(Module &M) { + return PointerType::getUnqual(getBinDescTy(M)); +} + +/// Creates binary descriptor for the given device images. Binary descriptor +/// is an object that is passed to the offloading runtime at program startup +/// and it describes all device images available in the executable or shared +/// library. It is defined as follows +/// +/// __attribute__((visibility("hidden"))) +/// extern __tgt_offload_entry *__start_omp_offloading_entries; +/// __attribute__((visibility("hidden"))) +/// extern __tgt_offload_entry *__stop_omp_offloading_entries; +/// +/// static const char Image0[] = { }; +/// ... +/// static const char ImageN[] = { }; +/// +/// static const __tgt_device_image Images[] = { +/// { +/// Image0, /*ImageStart*/ +/// Image0 + sizeof(Image0), /*ImageEnd*/ +/// __start_omp_offloading_entries, /*EntriesBegin*/ +/// __stop_omp_offloading_entries /*EntriesEnd*/ +/// }, +/// ... +/// { +/// ImageN, /*ImageStart*/ +/// ImageN + sizeof(ImageN), /*ImageEnd*/ +/// __start_omp_offloading_entries, /*EntriesBegin*/ +/// __stop_omp_offloading_entries /*EntriesEnd*/ +/// } +/// }; +/// +/// static const __tgt_bin_desc BinDesc = { +/// sizeof(Images) / sizeof(Images[0]), /*NumDeviceImages*/ +/// Images, /*DeviceImages*/ +/// __start_omp_offloading_entries, /*HostEntriesBegin*/ +/// __stop_omp_offloading_entries /*HostEntriesEnd*/ +/// }; +/// +/// Global variable that represents BinDesc is returned. +GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, + EntryArrayTy EntryArray, StringRef Suffix) { + LLVMContext &C = M.getContext(); + auto [EntriesB, EntriesE] = EntryArray; + + auto *Zero = ConstantInt::get(getSizeTTy(M), 0u); + Constant *ZeroZero[] = {Zero, Zero}; + + // Create initializer for the images array. 
+ SmallVector ImagesInits; + ImagesInits.reserve(Bufs.size()); + for (ArrayRef Buf : Bufs) { + // We embed the full offloading entry so the binary utilities can parse it. + auto *Data = ConstantDataArray::get(C, Buf); + auto *Image = new GlobalVariable(M, Data->getType(), /*isConstant=*/true, + GlobalVariable::InternalLinkage, Data, + ".omp_offloading.device_image" + Suffix); + Image->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + Image->setSection(".llvm.offloading"); + Image->setAlignment(Align(object::OffloadBinary::getAlignment())); + + StringRef Binary(Buf.data(), Buf.size()); + assert(identify_magic(Binary) == file_magic::offload_binary && + "Invalid binary format"); + + // The device image struct contains the pointer to the beginning and end of + // the image stored inside of the offload binary. There should only be one + // of these for each buffer so we parse it out manually. + const auto *Header = + reinterpret_cast( + Binary.bytes_begin()); + const auto *Entry = reinterpret_cast( + Binary.bytes_begin() + Header->EntryOffset); + + auto *Begin = ConstantInt::get(getSizeTTy(M), Entry->ImageOffset); + auto *Size = + ConstantInt::get(getSizeTTy(M), Entry->ImageOffset + Entry->ImageSize); + Constant *ZeroBegin[] = {Zero, Begin}; + Constant *ZeroSize[] = {Zero, Size}; + + auto *ImageB = + ConstantExpr::getGetElementPtr(Image->getValueType(), Image, ZeroBegin); + auto *ImageE = + ConstantExpr::getGetElementPtr(Image->getValueType(), Image, ZeroSize); + + ImagesInits.push_back(ConstantStruct::get(getDeviceImageTy(M), ImageB, + ImageE, EntriesB, EntriesE)); + } + + // Then create images array. 
+ auto *ImagesData = ConstantArray::get( + ArrayType::get(getDeviceImageTy(M), ImagesInits.size()), ImagesInits); + + auto *Images = + new GlobalVariable(M, ImagesData->getType(), /*isConstant*/ true, + GlobalValue::InternalLinkage, ImagesData, + ".omp_offloading.device_images" + Suffix); + Images->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + auto *ImagesB = + ConstantExpr::getGetElementPtr(Images->getValueType(), Images, ZeroZero); + + // And finally create the binary descriptor object. + auto *DescInit = ConstantStruct::get( + getBinDescTy(M), + ConstantInt::get(Type::getInt32Ty(C), ImagesInits.size()), ImagesB, + EntriesB, EntriesE); + + return new GlobalVariable(M, DescInit->getType(), /*isConstant*/ true, + GlobalValue::InternalLinkage, DescInit, + ".omp_offloading.descriptor" + Suffix); +} + +void createRegisterFunction(Module &M, GlobalVariable *BinDesc, + StringRef Suffix) { + LLVMContext &C = M.getContext(); + auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_reg" + Suffix, &M); + Func->setSection(".text.startup"); + + // Get __tgt_register_lib function declaration. + auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), + /*isVarArg*/ false); + FunctionCallee RegFuncC = + M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + + // Construct function body + IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); + Builder.CreateCall(RegFuncC, BinDesc); + Builder.CreateRetVoid(); + + // Add this function to constructors. + // Set priority to 1 so that __tgt_register_lib is executed AFTER + // __tgt_register_requires (we want to know what requirements have been + // asked for before we load a libomptarget plugin so that by the time the + // plugin is loaded it can report how many devices there are which can + // satisfy these requirements). 
+ appendToGlobalCtors(M, Func, /*Priority*/ 1); +} + +void createUnregisterFunction(Module &M, GlobalVariable *BinDesc, + StringRef Suffix) { + LLVMContext &C = M.getContext(); + auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); + auto *Func = + Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_unreg" + Suffix, &M); + Func->setSection(".text.startup"); + + // Get __tgt_unregister_lib function declaration. + auto *UnRegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), + /*isVarArg*/ false); + FunctionCallee UnRegFuncC = + M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); + + // Construct function body + IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); + Builder.CreateCall(UnRegFuncC, BinDesc); + Builder.CreateRetVoid(); + + // Add this function to global destructors. + // Match priority of __tgt_register_lib + appendToGlobalDtors(M, Func, /*Priority*/ 1); +} + +// struct fatbin_wrapper { +// int32_t magic; +// int32_t version; +// void *image; +// void *reserved; +//}; +StructType *getFatbinWrapperTy(Module &M) { + LLVMContext &C = M.getContext(); + StructType *FatbinTy = StructType::getTypeByName(C, "fatbin_wrapper"); + if (!FatbinTy) + FatbinTy = StructType::create( + "fatbin_wrapper", Type::getInt32Ty(C), Type::getInt32Ty(C), + PointerType::getUnqual(C), PointerType::getUnqual(C)); + return FatbinTy; +} + +/// Embed the image \p Image into the module \p M so it can be found by the +/// runtime. +GlobalVariable *createFatbinDesc(Module &M, ArrayRef Image, bool IsHIP, + StringRef Suffix) { + LLVMContext &C = M.getContext(); + llvm::Type *Int8PtrTy = PointerType::getUnqual(C); + llvm::Triple Triple = llvm::Triple(M.getTargetTriple()); + + // Create the global string containing the fatbinary. + StringRef FatbinConstantSection = + IsHIP ? ".hip_fatbin" + : (Triple.isMacOSX() ? 
"__NV_CUDA,__nv_fatbin" : ".nv_fatbin"); + auto *Data = ConstantDataArray::get(C, Image); + auto *Fatbin = new GlobalVariable(M, Data->getType(), /*isConstant*/ true, + GlobalVariable::InternalLinkage, Data, + ".fatbin_image" + Suffix); + Fatbin->setSection(FatbinConstantSection); + + // Create the fatbinary wrapper + StringRef FatbinWrapperSection = IsHIP ? ".hipFatBinSegment" + : Triple.isMacOSX() ? "__NV_CUDA,__fatbin" + : ".nvFatBinSegment"; + Constant *FatbinWrapper[] = { + ConstantInt::get(Type::getInt32Ty(C), IsHIP ? HIPFatMagic : CudaFatMagic), + ConstantInt::get(Type::getInt32Ty(C), 1), + ConstantExpr::getPointerBitCastOrAddrSpaceCast(Fatbin, Int8PtrTy), + ConstantPointerNull::get(PointerType::getUnqual(C))}; + + Constant *FatbinInitializer = + ConstantStruct::get(getFatbinWrapperTy(M), FatbinWrapper); + + auto *FatbinDesc = + new GlobalVariable(M, getFatbinWrapperTy(M), + /*isConstant*/ true, GlobalValue::InternalLinkage, + FatbinInitializer, ".fatbin_wrapper" + Suffix); + FatbinDesc->setSection(FatbinWrapperSection); + FatbinDesc->setAlignment(Align(8)); + + return FatbinDesc; +} + +/// Create the register globals function. We will iterate all of the offloading +/// entries stored at the begin / end symbols and register them according to +/// their type. 
This creates the following function in IR: +/// +/// extern struct __tgt_offload_entry __start_cuda_offloading_entries; +/// extern struct __tgt_offload_entry __stop_cuda_offloading_entries; +/// +/// extern void __cudaRegisterFunction(void **, void *, void *, void *, int, +/// void *, void *, void *, void *, int *); +/// extern void __cudaRegisterVar(void **, void *, void *, void *, int32_t, +/// int64_t, int32_t, int32_t); +/// +/// void __cudaRegisterTest(void **fatbinHandle) { +/// for (struct __tgt_offload_entry *entry = &__start_cuda_offloading_entries; +/// entry != &__stop_cuda_offloading_entries; ++entry) { +/// if (!entry->size) +/// __cudaRegisterFunction(fatbinHandle, entry->addr, entry->name, +/// entry->name, -1, 0, 0, 0, 0, 0); +/// else +/// __cudaRegisterVar(fatbinHandle, entry->addr, entry->name, entry->name, +/// 0, entry->size, 0, 0); +/// } +/// } +Function *createRegisterGlobalsFunction(Module &M, bool IsHIP, + EntryArrayTy EntryArray, + StringRef Suffix, + bool EmitSurfacesAndTextures) { + LLVMContext &C = M.getContext(); + auto [EntriesB, EntriesE] = EntryArray; + + // Get the __cudaRegisterFunction function declaration. + PointerType *Int8PtrTy = PointerType::get(C, 0); + PointerType *Int8PtrPtrTy = PointerType::get(C, 0); + PointerType *Int32PtrTy = PointerType::get(C, 0); + auto *RegFuncTy = FunctionType::get( + Type::getInt32Ty(C), + {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Type::getInt32Ty(C), + Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Int32PtrTy}, + /*isVarArg*/ false); + FunctionCallee RegFunc = M.getOrInsertFunction( + IsHIP ? "__hipRegisterFunction" : "__cudaRegisterFunction", RegFuncTy); + + // Get the __cudaRegisterVar function declaration. + auto *RegVarTy = FunctionType::get( + Type::getVoidTy(C), + {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Type::getInt32Ty(C), + getSizeTTy(M), Type::getInt32Ty(C), Type::getInt32Ty(C)}, + /*isVarArg*/ false); + FunctionCallee RegVar = M.getOrInsertFunction( + IsHIP ? 
"__hipRegisterVar" : "__cudaRegisterVar", RegVarTy); + + // Get the __cudaRegisterSurface function declaration. + FunctionType *RegSurfaceTy = + FunctionType::get(Type::getVoidTy(C), + {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, + Type::getInt32Ty(C), Type::getInt32Ty(C)}, + /*isVarArg=*/false); + FunctionCallee RegSurface = M.getOrInsertFunction( + IsHIP ? "__hipRegisterSurface" : "__cudaRegisterSurface", RegSurfaceTy); + + // Get the __cudaRegisterTexture function declaration. + FunctionType *RegTextureTy = FunctionType::get( + Type::getVoidTy(C), + {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Type::getInt32Ty(C), + Type::getInt32Ty(C), Type::getInt32Ty(C)}, + /*isVarArg=*/false); + FunctionCallee RegTexture = M.getOrInsertFunction( + IsHIP ? "__hipRegisterTexture" : "__cudaRegisterTexture", RegTextureTy); + + auto *RegGlobalsTy = FunctionType::get(Type::getVoidTy(C), Int8PtrPtrTy, + /*isVarArg*/ false); + auto *RegGlobalsFn = + Function::Create(RegGlobalsTy, GlobalValue::InternalLinkage, + IsHIP ? ".hip.globals_reg" : ".cuda.globals_reg", &M); + RegGlobalsFn->setSection(".text.startup"); + + // Create the loop to register all the entries. 
+ IRBuilder<> Builder(BasicBlock::Create(C, "entry", RegGlobalsFn)); + auto *EntryBB = BasicBlock::Create(C, "while.entry", RegGlobalsFn); + auto *IfThenBB = BasicBlock::Create(C, "if.then", RegGlobalsFn); + auto *IfElseBB = BasicBlock::Create(C, "if.else", RegGlobalsFn); + auto *SwGlobalBB = BasicBlock::Create(C, "sw.global", RegGlobalsFn); + auto *SwManagedBB = BasicBlock::Create(C, "sw.managed", RegGlobalsFn); + auto *SwSurfaceBB = BasicBlock::Create(C, "sw.surface", RegGlobalsFn); + auto *SwTextureBB = BasicBlock::Create(C, "sw.texture", RegGlobalsFn); + auto *IfEndBB = BasicBlock::Create(C, "if.end", RegGlobalsFn); + auto *ExitBB = BasicBlock::Create(C, "while.end", RegGlobalsFn); + + auto *EntryCmp = Builder.CreateICmpNE(EntriesB, EntriesE); + Builder.CreateCondBr(EntryCmp, EntryBB, ExitBB); + Builder.SetInsertPoint(EntryBB); + auto *Entry = Builder.CreatePHI(PointerType::getUnqual(C), 2, "entry"); + auto *AddrPtr = + Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry, + {ConstantInt::get(getSizeTTy(M), 0), + ConstantInt::get(Type::getInt32Ty(C), 0)}); + auto *Addr = Builder.CreateLoad(Int8PtrTy, AddrPtr, "addr"); + auto *NamePtr = + Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry, + {ConstantInt::get(getSizeTTy(M), 0), + ConstantInt::get(Type::getInt32Ty(C), 1)}); + auto *Name = Builder.CreateLoad(Int8PtrTy, NamePtr, "name"); + auto *SizePtr = + Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry, + {ConstantInt::get(getSizeTTy(M), 0), + ConstantInt::get(Type::getInt32Ty(C), 2)}); + auto *Size = Builder.CreateLoad(getSizeTTy(M), SizePtr, "size"); + auto *FlagsPtr = + Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry, + {ConstantInt::get(getSizeTTy(M), 0), + ConstantInt::get(Type::getInt32Ty(C), 3)}); + auto *Flags = Builder.CreateLoad(Type::getInt32Ty(C), FlagsPtr, "flags"); + auto *DataPtr = + Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry, + {ConstantInt::get(getSizeTTy(M), 0), + 
ConstantInt::get(Type::getInt32Ty(C), 4)}); + auto *Data = Builder.CreateLoad(Type::getInt32Ty(C), DataPtr, "textype"); + auto *Kind = Builder.CreateAnd( + Flags, ConstantInt::get(Type::getInt32Ty(C), 0x7), "type"); + + // Extract the flags stored in the bit-field and convert them to C booleans. + auto *ExternBit = Builder.CreateAnd( + Flags, ConstantInt::get(Type::getInt32Ty(C), + llvm::offloading::OffloadGlobalExtern)); + auto *Extern = Builder.CreateLShr( + ExternBit, ConstantInt::get(Type::getInt32Ty(C), 3), "extern"); + auto *ConstantBit = Builder.CreateAnd( + Flags, ConstantInt::get(Type::getInt32Ty(C), + llvm::offloading::OffloadGlobalConstant)); + auto *Const = Builder.CreateLShr( + ConstantBit, ConstantInt::get(Type::getInt32Ty(C), 4), "constant"); + auto *NormalizedBit = Builder.CreateAnd( + Flags, ConstantInt::get(Type::getInt32Ty(C), + llvm::offloading::OffloadGlobalNormalized)); + auto *Normalized = Builder.CreateLShr( + NormalizedBit, ConstantInt::get(Type::getInt32Ty(C), 5), "normalized"); + auto *FnCond = + Builder.CreateICmpEQ(Size, ConstantInt::getNullValue(getSizeTTy(M))); + Builder.CreateCondBr(FnCond, IfThenBB, IfElseBB); + + // Create kernel registration code. + Builder.SetInsertPoint(IfThenBB); + Builder.CreateCall(RegFunc, {RegGlobalsFn->arg_begin(), Addr, Name, Name, + ConstantInt::get(Type::getInt32Ty(C), -1), + ConstantPointerNull::get(Int8PtrTy), + ConstantPointerNull::get(Int8PtrTy), + ConstantPointerNull::get(Int8PtrTy), + ConstantPointerNull::get(Int8PtrTy), + ConstantPointerNull::get(Int32PtrTy)}); + Builder.CreateBr(IfEndBB); + Builder.SetInsertPoint(IfElseBB); + + auto *Switch = Builder.CreateSwitch(Kind, IfEndBB); + // Create global variable registration code. 
+ Builder.SetInsertPoint(SwGlobalBB); + Builder.CreateCall(RegVar, + {RegGlobalsFn->arg_begin(), Addr, Name, Name, Extern, Size, + Const, ConstantInt::get(Type::getInt32Ty(C), 0)}); + Builder.CreateBr(IfEndBB); + Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalEntry), + SwGlobalBB); + + // Create managed variable registration code. + Builder.SetInsertPoint(SwManagedBB); + Builder.CreateBr(IfEndBB); + Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalManagedEntry), + SwManagedBB); + // Create surface variable registration code. + Builder.SetInsertPoint(SwSurfaceBB); + if (EmitSurfacesAndTextures) + Builder.CreateCall(RegSurface, {RegGlobalsFn->arg_begin(), Addr, Name, Name, + Data, Extern}); + Builder.CreateBr(IfEndBB); + Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalSurfaceEntry), + SwSurfaceBB); + + // Create texture variable registration code. + Builder.SetInsertPoint(SwTextureBB); + if (EmitSurfacesAndTextures) + Builder.CreateCall(RegTexture, {RegGlobalsFn->arg_begin(), Addr, Name, Name, + Data, Normalized, Extern}); + Builder.CreateBr(IfEndBB); + Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalTextureEntry), + SwTextureBB); + + Builder.SetInsertPoint(IfEndBB); + auto *NewEntry = Builder.CreateInBoundsGEP( + offloading::getEntryTy(M), Entry, ConstantInt::get(getSizeTTy(M), 1)); + auto *Cmp = Builder.CreateICmpEQ( + NewEntry, + ConstantExpr::getInBoundsGetElementPtr( + ArrayType::get(offloading::getEntryTy(M), 0), EntriesE, + ArrayRef({ConstantInt::get(getSizeTTy(M), 0), + ConstantInt::get(getSizeTTy(M), 0)}))); + Entry->addIncoming( + ConstantExpr::getInBoundsGetElementPtr( + ArrayType::get(offloading::getEntryTy(M), 0), EntriesB, + ArrayRef({ConstantInt::get(getSizeTTy(M), 0), + ConstantInt::get(getSizeTTy(M), 0)})), + &RegGlobalsFn->getEntryBlock()); + Entry->addIncoming(NewEntry, IfEndBB); + Builder.CreateCondBr(Cmp, ExitBB, EntryBB); + Builder.SetInsertPoint(ExitBB); + Builder.CreateRetVoid(); 
+ + return RegGlobalsFn; +} + +// Create the constructor and destructor to register the fatbinary with the CUDA +// runtime. +void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc, + bool IsHIP, EntryArrayTy EntryArray, + StringRef Suffix, + bool EmitSurfacesAndTextures) { + LLVMContext &C = M.getContext(); + auto *CtorFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); + auto *CtorFunc = Function::Create( + CtorFuncTy, GlobalValue::InternalLinkage, + (IsHIP ? ".hip.fatbin_reg" : ".cuda.fatbin_reg") + Suffix, &M); + CtorFunc->setSection(".text.startup"); + + auto *DtorFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); + auto *DtorFunc = Function::Create( + DtorFuncTy, GlobalValue::InternalLinkage, + (IsHIP ? ".hip.fatbin_unreg" : ".cuda.fatbin_unreg") + Suffix, &M); + DtorFunc->setSection(".text.startup"); + + auto *PtrTy = PointerType::getUnqual(C); + + // Get the __cudaRegisterFatBinary function declaration. + auto *RegFatTy = FunctionType::get(PtrTy, PtrTy, /*isVarArg=*/false); + FunctionCallee RegFatbin = M.getOrInsertFunction( + IsHIP ? "__hipRegisterFatBinary" : "__cudaRegisterFatBinary", RegFatTy); + // Get the __cudaRegisterFatBinaryEnd function declaration. + auto *RegFatEndTy = + FunctionType::get(Type::getVoidTy(C), PtrTy, /*isVarArg=*/false); + FunctionCallee RegFatbinEnd = + M.getOrInsertFunction("__cudaRegisterFatBinaryEnd", RegFatEndTy); + // Get the __cudaUnregisterFatBinary function declaration. + auto *UnregFatTy = + FunctionType::get(Type::getVoidTy(C), PtrTy, /*isVarArg=*/false); + FunctionCallee UnregFatbin = M.getOrInsertFunction( + IsHIP ? 
"__hipUnregisterFatBinary" : "__cudaUnregisterFatBinary", + UnregFatTy); + + auto *AtExitTy = + FunctionType::get(Type::getInt32Ty(C), PtrTy, /*isVarArg=*/false); + FunctionCallee AtExit = M.getOrInsertFunction("atexit", AtExitTy); + + auto *BinaryHandleGlobal = new llvm::GlobalVariable( + M, PtrTy, false, llvm::GlobalValue::InternalLinkage, + llvm::ConstantPointerNull::get(PtrTy), + (IsHIP ? ".hip.binary_handle" : ".cuda.binary_handle") + Suffix); + + // Create the constructor to register this image with the runtime. + IRBuilder<> CtorBuilder(BasicBlock::Create(C, "entry", CtorFunc)); + CallInst *Handle = CtorBuilder.CreateCall( + RegFatbin, + ConstantExpr::getPointerBitCastOrAddrSpaceCast(FatbinDesc, PtrTy)); + CtorBuilder.CreateAlignedStore( + Handle, BinaryHandleGlobal, + Align(M.getDataLayout().getPointerTypeSize(PtrTy))); + CtorBuilder.CreateCall(createRegisterGlobalsFunction(M, IsHIP, EntryArray, + Suffix, + EmitSurfacesAndTextures), + Handle); + if (!IsHIP) + CtorBuilder.CreateCall(RegFatbinEnd, Handle); + CtorBuilder.CreateCall(AtExit, DtorFunc); + CtorBuilder.CreateRetVoid(); + + // Create the destructor to unregister the image with the runtime. We cannot + // use a standard global destructor after CUDA 9.2 so this must be called by + // `atexit()` instead. + IRBuilder<> DtorBuilder(BasicBlock::Create(C, "entry", DtorFunc)); + LoadInst *BinaryHandle = DtorBuilder.CreateAlignedLoad( + PtrTy, BinaryHandleGlobal, + Align(M.getDataLayout().getPointerTypeSize(PtrTy))); + DtorBuilder.CreateCall(UnregFatbin, BinaryHandle); + DtorBuilder.CreateRetVoid(); + + // Add this function to constructors. 
+ appendToGlobalCtors(M, CtorFunc, /*Priority*/ 1); +} +} // namespace + +Error offloading::wrapOpenMPBinaries(Module &M, ArrayRef> Images, + EntryArrayTy EntryArray, + llvm::StringRef Suffix) { + GlobalVariable *Desc = createBinDesc(M, Images, EntryArray, Suffix); + if (!Desc) + return createStringError(inconvertibleErrorCode(), + "No binary descriptors created."); + createRegisterFunction(M, Desc, Suffix); + createUnregisterFunction(M, Desc, Suffix); + return Error::success(); +} + +Error offloading::wrapCudaBinary(Module &M, ArrayRef Image, + EntryArrayTy EntryArray, + llvm::StringRef Suffix, + bool EmitSurfacesAndTextures) { + GlobalVariable *Desc = createFatbinDesc(M, Image, /*IsHip=*/false, Suffix); + if (!Desc) + return createStringError(inconvertibleErrorCode(), + "No fatbin section created."); + + createRegisterFatbinFunction(M, Desc, /*IsHip=*/false, EntryArray, Suffix, + EmitSurfacesAndTextures); + return Error::success(); +} + +Error offloading::wrapHIPBinary(Module &M, ArrayRef Image, + EntryArrayTy EntryArray, llvm::StringRef Suffix, + bool EmitSurfacesAndTextures) { + GlobalVariable *Desc = createFatbinDesc(M, Image, /*IsHip=*/true, Suffix); + if (!Desc) + return createStringError(inconvertibleErrorCode(), + "No fatbin section created."); + + createRegisterFatbinFunction(M, Desc, /*IsHip=*/true, EntryArray, Suffix, + EmitSurfacesAndTextures); + return Error::success(); +} diff --git a/llvm/lib/Frontend/Offloading/Utility.cpp b/llvm/lib/Frontend/Offloading/Utility.cpp index 25f6095..531919b 100644 --- a/llvm/lib/Frontend/Offloading/Utility.cpp +++ b/llvm/lib/Frontend/Offloading/Utility.cpp @@ -1,4 +1,4 @@ -//===- Utility.cpp ------ Collection of geneirc offloading utilities ------===// +//===- Utility.cpp ------ Collection of generic offloading utilities ------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -28,11 +28,10 @@ StructType *offloading::getEntryTy(Module &M) { } // TODO: Rework this interface to be more generic. -void offloading::emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, - uint64_t Size, int32_t Flags, int32_t Data, - StringRef SectionName) { - llvm::Triple Triple(M.getTargetTriple()); - +std::pair +offloading::getOffloadingEntryInitializer(Module &M, Constant *Addr, + StringRef Name, uint64_t Size, + int32_t Flags, int32_t Data) { Type *Int8PtrTy = PointerType::getUnqual(M.getContext()); Type *Int32Ty = Type::getInt32Ty(M.getContext()); Type *SizeTy = M.getDataLayout().getIntPtrType(M.getContext()); @@ -54,6 +53,16 @@ void offloading::emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, ConstantInt::get(Int32Ty, Data), }; Constant *EntryInitializer = ConstantStruct::get(getEntryTy(M), EntryData); + return {EntryInitializer, Str}; +} + +void offloading::emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, + uint64_t Size, int32_t Flags, int32_t Data, + StringRef SectionName) { + llvm::Triple Triple(M.getTargetTriple()); + + auto [EntryInitializer, NameGV] = + getOffloadingEntryInitializer(M, Addr, Name, Size, Flags, Data); auto *Entry = new GlobalVariable( M, getEntryTy(M), -- cgit v1.1