//===-- lib/cuda/memory.cpp -------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Runtime/CUDA/memory.h"
#include "flang-rt/runtime/assign-impl.h"
#include "flang-rt/runtime/descriptor.h"
#include "flang-rt/runtime/environment.h"
#include "flang-rt/runtime/terminator.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Runtime/CUDA/descriptor.h"
#include "flang/Runtime/CUDA/memmove-function.h"
#include "flang/Runtime/assign.h"

#include "cuda_runtime.h"

namespace Fortran::runtime::cuda {
extern "C" {

void *RTDEF(CUFMemAlloc)(
    std::size_t bytes, unsigned type, const char *sourceFile, int sourceLine) {
  void *ptr = nullptr;
  bytes = bytes ? bytes : 1;
  if (type == kMemTypeDevice) {
    if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
      CUDA_REPORT_IF_ERROR(
          cudaMallocManaged((void **)&ptr, bytes, cudaMemAttachGlobal));
    } else {
      CUDA_REPORT_IF_ERROR(cudaMalloc((void **)&ptr, bytes));
    }
  } else if (type == kMemTypeManaged || type == kMemTypeUnified) {
    CUDA_REPORT_IF_ERROR(
        cudaMallocManaged((void **)&ptr, bytes, cudaMemAttachGlobal));
  } else if (type == kMemTypePinned) {
    CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&ptr, bytes));
  } else {
    Terminator terminator{sourceFile, sourceLine};
    terminator.Crash("unsupported memory type");
  }
  return ptr;
}

void RTDEF(CUFMemFree)(
    void *ptr, unsigned type, const char *sourceFile, int sourceLine) {
  if (!ptr)
    return;
  if (type == kMemTypeDevice || type == kMemTypeManaged ||
      type == kMemTypeUnified) {
    CUDA_REPORT_IF_ERROR(cudaFree(ptr));
  } else if (type == kMemTypePinned) {
    CUDA_REPORT_IF_ERROR(cudaFreeHost(ptr));
  } else {
    Terminator terminator{sourceFile, sourceLine};
    terminator.Crash("unsupported memory type");
  }
}

void RTDEF(CUFMemsetDescriptor)(
    Descriptor *desc, void *value, const char *sourceFile, int sourceLine) {
  Terminator terminator{sourceFile, sourceLine};
  terminator.Crash("not yet implemented: CUDA data transfer from a scalar "
                   "value to a descriptor");
}

void RTDEF(CUFDataTransferPtrPtr)(void *dst, void *src, std::size_t bytes,
    unsigned mode, const char *sourceFile, int sourceLine) {
  cudaMemcpyKind kind;
  if (mode == kHostToDevice) {
    kind = cudaMemcpyHostToDevice;
  } else if (mode == kDeviceToHost) {
    kind = cudaMemcpyDeviceToHost;
  } else if (mode == kDeviceToDevice) {
    kind = cudaMemcpyDeviceToDevice;
  } else {
    Terminator terminator{sourceFile, sourceLine};
    terminator.Crash("host to host copy not supported");
  }
  // TODO: Use cudaMemcpyAsync when we have support for stream.
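  // A minimal sketch of the asynchronous variant, assuming a `stream` handle
  // were threaded through this entry point (hypothetical; no such argument
  // exists in the current interface):
  //   CUDA_REPORT_IF_ERROR(cudaMemcpyAsync(dst, src, bytes, kind, stream));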
  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, bytes, kind));
}

void RTDEF(CUFDataTransferPtrDesc)(void *addr, Descriptor *desc,
    std::size_t bytes, unsigned mode, const char *sourceFile, int sourceLine) {
  Terminator terminator{sourceFile, sourceLine};
  terminator.Crash(
      "not yet implemented: CUDA data transfer from a descriptor to a pointer");
}

void RTDECL(CUFDataTransferDescDesc)(Descriptor *dstDesc, Descriptor *srcDesc,
    unsigned mode, const char *sourceFile, int sourceLine) {
  MemmoveFct memmoveFct;
  Terminator terminator{sourceFile, sourceLine};
  if (mode == kHostToDevice) {
    memmoveFct = &MemmoveHostToDevice;
  } else if (mode == kDeviceToHost) {
    memmoveFct = &MemmoveDeviceToHost;
  } else if (mode == kDeviceToDevice) {
    memmoveFct = &MemmoveDeviceToDevice;
  } else {
    terminator.Crash("host to host copy not supported");
  }
  // Allocate the dst descriptor if it is not already allocated.
  if (!dstDesc->IsAllocated()) {
    dstDesc->ApplyMold(*srcDesc, dstDesc->rank());
    dstDesc->Allocate(/*asyncObject=*/nullptr);
  }
  if ((srcDesc->rank() > 0) && (dstDesc->Elements() <= srcDesc->Elements()) &&
      srcDesc->IsContiguous() && dstDesc->IsContiguous()) {
    // Special case when the rhs is at least as large as the lhs and both are
    // contiguous arrays. In this case we do a simple ptr to ptr transfer with
    // the size of the lhs. This is allowed in the reference compiler and it
    // avoids errors triggered in the Assign runtime function used for the
    // general case below.
    RTNAME(CUFDataTransferPtrPtr)(dstDesc->raw().base_addr,
        srcDesc->raw().base_addr, dstDesc->Elements() * dstDesc->ElementBytes(),
        mode, sourceFile, sourceLine);
  } else {
    Fortran::runtime::Assign(
        *dstDesc, *srcDesc, terminator, MaybeReallocate, memmoveFct);
  }
}

void RTDECL(CUFDataTransferCstDesc)(Descriptor *dstDesc, Descriptor *srcDesc,
    unsigned mode, const char *sourceFile, int sourceLine) {
  MemmoveFct memmoveFct;
  Terminator terminator{sourceFile, sourceLine};
  if (mode == kHostToDevice) {
    memmoveFct = &MemmoveHostToDevice;
  } else if (mode == kDeviceToHost) {
    memmoveFct = &MemmoveDeviceToHost;
  } else if (mode == kDeviceToDevice) {
    memmoveFct = &MemmoveDeviceToDevice;
  } else {
    terminator.Crash("host to host copy not supported");
  }
  Fortran::runtime::DoFromSourceAssign(
      *dstDesc, *srcDesc, terminator, memmoveFct);
}

void RTDECL(CUFDataTransferDescDescNoRealloc)(Descriptor *dstDesc,
    Descriptor *srcDesc, unsigned mode, const char *sourceFile,
    int sourceLine) {
  MemmoveFct memmoveFct;
  Terminator terminator{sourceFile, sourceLine};
  if (mode == kHostToDevice) {
    memmoveFct = &MemmoveHostToDevice;
  } else if (mode == kDeviceToHost) {
    memmoveFct = &MemmoveDeviceToHost;
  } else if (mode == kDeviceToDevice) {
    memmoveFct = &MemmoveDeviceToDevice;
  } else {
    terminator.Crash("host to host copy not supported");
  }
  Fortran::runtime::Assign(
      *dstDesc, *srcDesc, terminator, NoAssignFlags, memmoveFct);
}

void RTDECL(CUFDataTransferGlobalDescDesc)(Descriptor *dstDesc,
    Descriptor *srcDesc, unsigned mode, const char *sourceFile,
    int sourceLine) {
  RTNAME(CUFDataTransferDescDesc)
  (dstDesc, srcDesc, mode, sourceFile, sourceLine);
  if ((mode == kHostToDevice) || (mode == kDeviceToDevice)) {
    void *deviceAddr{
        RTNAME(CUFGetDeviceAddress)((void *)dstDesc, sourceFile, sourceLine)};
    RTNAME(CUFDescriptorSync)
    ((Descriptor *)deviceAddr, dstDesc, sourceFile, sourceLine);
  }
}

} // extern "C"
} // namespace Fortran::runtime::cuda