//===-- lib/cuda/allocator.cpp ----------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "flang/Runtime/CUDA/allocator.h" #include "flang-rt/runtime/allocator-registry.h" #include "flang-rt/runtime/derived.h" #include "flang-rt/runtime/descriptor.h" #include "flang-rt/runtime/environment.h" #include "flang-rt/runtime/lock.h" #include "flang-rt/runtime/stat.h" #include "flang-rt/runtime/terminator.h" #include "flang-rt/runtime/type-info.h" #include "flang/Common/ISO_Fortran_binding_wrapper.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Support/Fortran.h" #include "cuda_runtime.h" namespace Fortran::runtime::cuda { struct DeviceAllocation { void *ptr; std::size_t size; cudaStream_t stream; }; // Compare address values. nullptr will be sorted at the end of the array. int compareDeviceAlloc(const void *a, const void *b) { const DeviceAllocation *deva = (const DeviceAllocation *)a; const DeviceAllocation *devb = (const DeviceAllocation *)b; if (deva->ptr == nullptr && devb->ptr == nullptr) return 0; if (deva->ptr == nullptr) return 1; if (devb->ptr == nullptr) return -1; return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0); } // Dynamic array for tracking asynchronous allocations. static DeviceAllocation *deviceAllocations = nullptr; Lock lock; static int maxDeviceAllocations{512}; // Initial size static int numDeviceAllocations{0}; static constexpr int allocNotFound{-1}; static void initAllocations() { if (!deviceAllocations) { deviceAllocations = static_cast( malloc(maxDeviceAllocations * sizeof(DeviceAllocation))); if (!deviceAllocations) { Terminator terminator{__FILE__, __LINE__}; terminator.Crash("Failed to allocate tracking array"); } } } static void doubleAllocationArray() { unsigned newSize = maxDeviceAllocations * 2; DeviceAllocation *newArray = static_cast( realloc(deviceAllocations, newSize * sizeof(DeviceAllocation))); if (!newArray) { Terminator terminator{__FILE__, __LINE__}; terminator.Crash("Failed to reallocate tracking array"); } deviceAllocations = newArray; maxDeviceAllocations = newSize; } static unsigned findAllocation(void *ptr) { if (numDeviceAllocations == 0) { return allocNotFound; } int left{0}; int right{numDeviceAllocations - 1}; if (left == right) { return left; } while (left <= right) { int mid = left + (right - left) / 2; if (deviceAllocations[mid].ptr == ptr) { return mid; } if (deviceAllocations[mid].ptr < ptr) { left = mid + 1; } else { right = mid - 1; } } return allocNotFound; } static void insertAllocation(void *ptr, std::size_t size, cudaStream_t stream) { CriticalSection critical{lock}; initAllocations(); if (numDeviceAllocations >= maxDeviceAllocations) { doubleAllocationArray(); } deviceAllocations[numDeviceAllocations].ptr = ptr; deviceAllocations[numDeviceAllocations].size = size; deviceAllocations[numDeviceAllocations].stream = stream; ++numDeviceAllocations; qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation), compareDeviceAlloc); } static void eraseAllocation(int pos) { deviceAllocations[pos].ptr = nullptr; deviceAllocations[pos].size = 0; deviceAllocations[pos].stream = (cudaStream_t)0; qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation), compareDeviceAlloc); --numDeviceAllocations; } extern "C" { void RTDEF(CUFRegisterAllocator)() { allocatorRegistry.Register( kPinnedAllocatorPos, {&CUFAllocPinned, CUFFreePinned}); allocatorRegistry.Register( kDeviceAllocatorPos, {&CUFAllocDevice, CUFFreeDevice}); allocatorRegistry.Register( kManagedAllocatorPos, {&CUFAllocManaged, CUFFreeManaged}); allocatorRegistry.Register( kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified}); } } void *CUFAllocPinned( std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) { void *p; CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes)); return p; } void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); } void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t *asyncObject) { void *p; if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) { CUDA_REPORT_IF_ERROR( cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal)); } else { if (asyncObject == nullptr) { CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes)); } else { CUDA_REPORT_IF_ERROR( cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)*asyncObject)); insertAllocation(p, sizeInBytes, (cudaStream_t)*asyncObject); } } return p; } void CUFFreeDevice(void *p) { CriticalSection critical{lock}; int pos = findAllocation(p); if (pos >= 0) { cudaStream_t stream = deviceAllocations[pos].stream; eraseAllocation(pos); CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream)); } else { CUDA_REPORT_IF_ERROR(cudaFree(p)); } } void *CUFAllocManaged( std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) { void *p; CUDA_REPORT_IF_ERROR( cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal)); return reinterpret_cast(p); } void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); } void *CUFAllocUnified( std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) { // Call alloc managed for the time being. return CUFAllocManaged(sizeInBytes, asyncObject); } void CUFFreeUnified(void *p) { // Call free managed for the time being. CUFFreeManaged(p); } } // namespace Fortran::runtime::cuda