Diffstat (limited to 'liboffloadmic/runtime/offload_host.cpp')
-rw-r--r--  liboffloadmic/runtime/offload_host.cpp  2246
1 file changed, 1861 insertions(+), 385 deletions(-)
diff --git a/liboffloadmic/runtime/offload_host.cpp b/liboffloadmic/runtime/offload_host.cpp
index 23a873f..08f626f 100644
--- a/liboffloadmic/runtime/offload_host.cpp
+++ b/liboffloadmic/runtime/offload_host.cpp
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2014 Intel Corporation. All Rights Reserved.
+ Copyright (c) 2014-2015 Intel Corporation. All Rights Reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -28,7 +28,8 @@
*/
-// Forward declaration as the following 2 functions are declared as friend in offload_engine.h
+// Forward declaration as the following 2 functions are declared as friend
+// in offload_engine.h.
// CLANG does not like static to be after the friend declaration.
static void __offload_init_library_once(void);
static void __offload_fini_library(void);
@@ -63,6 +64,55 @@ static void __offload_fini_library(void);
#define GET_OFFLOAD_NUMBER(timer_data) \
timer_data? timer_data->offload_number : 0
+extern "C" {
+#ifdef TARGET_WINNT
+// Windows does not support imports from libraries without actually
+// including them as a dependence. We don't want to include libifcoremd.dll
+// as a dependence since it is used only for Fortran, when traceback is
+// enabled, so we chose to resolve the routine with GetProcAddress.
+#define FORTRAN_TRACE_BACK win_for__continue_traceback
+int win_for__continue_traceback( _Offload_result coi_offload_result )
+{
+ HINSTANCE hDLL;
+ int (* TraceBackRoutine)(_Offload_result value);
+
+ hDLL = LoadLibrary("libifcoremd.dll");
+ if (hDLL != 0) {
+ TraceBackRoutine = (int (*)(_Offload_result)) GetProcAddress(hDLL,
+ "for__continue_traceback");
+ if (TraceBackRoutine != 0) {
+ return TraceBackRoutine(coi_offload_result);
+ }
+ else {
+            OFFLOAD_TRACE(3,
+            "Cannot find for__continue_traceback routine in libifcoremd.dll\n");
+ exit(1);
+ }
+ }
+ else {
+        OFFLOAD_TRACE(3, "Cannot load libifcoremd.dll\n");
+ exit(1);
+ }
+ return 0;
+}
+
+#else // TARGET_WINNT
+
+#define FORTRAN_TRACE_BACK for__continue_traceback
+
+// for__continue_traceback is provided as a dummy to resolve link-time symbols
+// for C/C++ programs. For Fortran the actual Fortran library function in
+// libifcore.so is used.
+#pragma weak for__continue_traceback
+int for__continue_traceback( _Offload_result coi_offload_result )
+{
+ OFFLOAD_TRACE(3,
+        "liboffload function for__continue_traceback should not be called.\n");
+ exit(1);
+}
+#endif //TARGET_WINNT
+} // extern "C"
+
#ifdef TARGET_WINNT
// Small subset of ELF declarations for Windows which is needed to compile
// this file. ELF header is used to understand what binary type is contained
@@ -104,7 +154,16 @@ int offload_number = 0;
static const char *htrace_envname = "H_TRACE";
static const char *offload_report_envname = "OFFLOAD_REPORT";
-static char *timer_envname = "H_TIME";
+static const char *timer_envname = "H_TIME";
+
+// Location of the offload_main executable.
+// To be used if the main application has no offload and is not built
+// with -offload, but a dynamic library linked in has an offload pragma.
+char* mic_device_main = 0;
+
+// DMA channel count used by COI and set via
+// OFFLOAD_DMA_CHANNEL_COUNT environment variable
+uint32_t mic_dma_channel_count;
// Trace information
static const char* vardesc_direction_as_string[] = {
@@ -146,6 +205,13 @@ uint32_t mic_stack_size = 12 * 1024 * 1024;
// MIC_BUFFERSIZE
uint64_t mic_buffer_size = 0;
+// Preallocated 4K page memory size for buffers on MIC
+uint64_t mic_4k_buffer_size = 0;
+
+// Preallocated 2M page memory size for buffers on MIC
+uint64_t mic_2m_buffer_size = 0;
+
+
// MIC_LD_LIBRARY_PATH
char* mic_library_path = 0;
@@ -183,6 +249,15 @@ static const char *offload_active_wait_envname = "OFFLOAD_ACTIVE_WAIT";
int __omp_device_num = 0;
static const char *omp_device_num_envname = "OMP_DEFAULT_DEVICE";
+// OFFLOAD_PARALLEL_COPY
+static bool __offload_parallel_copy = false;
+static const char *parallel_copy_envname = "OFFLOAD_PARALLEL_COPY";
+
+// Use the COI interface for noncontiguous transfers if it exists.
+static bool __offload_use_coi_noncontiguous_transfer = false;
+static const char *use_coi_noncontiguous_transfer_envname =
+ "MIC_USE_COI_MULTI_D";
+
// The list of pending target libraries
static bool __target_libs;
static TargetImageList __target_libs_list;
@@ -192,6 +267,112 @@ static mutex_t stack_alloc_lock;
// Target executable
TargetImage* __target_exe;
+// Print readable offload flags
+static void trace_offload_flags(
+ OffloadHostTimerData* timer_data,
+ OffloadFlags offload_flags
+)
+{
+ // Sized big enough for all flag names
+ char fbuffer[256];
+ bool first = true;
+ if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
+ sprintf(fbuffer, " OffloadFlags=(");
+ if (offload_flags.bits.fortran_traceback) {
+ sprintf(fbuffer+strlen(fbuffer), "fortran_traceback");
+ first = false;
+ }
+ if (offload_flags.bits.omp_async) {
+ sprintf(fbuffer+strlen(fbuffer), first ? "omp_async" : ",omp_async");
+ first = false;
+ }
+ OFFLOAD_DEBUG_TRACE_1(1,
+ GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
+ "%s)\n", fbuffer);
+ }
+}
+
+// Print readable varDesc flags
+static void trace_varDesc_flags(
+ OffloadHostTimerData* timer_data,
+ varDescFlags offload_flags
+)
+{
+    // Sized big enough for all flag names
+ char fbuffer[256];
+ bool first = true;
+ if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
+ sprintf(fbuffer, " varDescFlags=(");
+ if (offload_flags.is_static) {
+ sprintf(fbuffer+strlen(fbuffer), "is_static");
+ first = false;
+ }
+ if (offload_flags.is_static_dstn) {
+ sprintf(fbuffer+strlen(fbuffer),
+ first ? "is_static_dstn" : ",is_static_dstn");
+ first = false;
+ }
+ if (offload_flags.has_length) {
+ sprintf(fbuffer+strlen(fbuffer),
+ first ? "has_length" : ",has_length");
+ first = false;
+ }
+ if (offload_flags.is_stack_buf) {
+ sprintf(fbuffer+strlen(fbuffer),
+ first ? "is_stack_buf" : ",is_stack_buf");
+ first = false;
+ }
+ if (offload_flags.targetptr) {
+ sprintf(fbuffer+strlen(fbuffer),
+ first ? "targetptr" : ",targetptr");
+ first = false;
+ }
+ if (offload_flags.preallocated) {
+ sprintf(fbuffer+strlen(fbuffer),
+ first ? "preallocated" : ",preallocated");
+ first = false;
+ }
+ if (offload_flags.is_pointer) {
+ sprintf(fbuffer+strlen(fbuffer),
+ first ? "is_pointer" : ",is_pointer");
+ first = false;
+ }
+ if (offload_flags.sink_addr) {
+ sprintf(fbuffer+strlen(fbuffer),
+ first ? "sink_addr" : ",sink_addr");
+ first = false;
+ }
+ if (offload_flags.alloc_disp) {
+ sprintf(fbuffer+strlen(fbuffer),
+ first ? "alloc_disp" : ",alloc_disp");
+ first = false;
+ }
+ if (offload_flags.is_noncont_src) {
+ sprintf(fbuffer+strlen(fbuffer),
+ first ? "is_noncont_src" : ",is_noncont_src");
+ first = false;
+ }
+ if (offload_flags.is_noncont_dst) {
+ sprintf(fbuffer+strlen(fbuffer),
+ first ? "is_noncont_dst" : ",is_noncont_dst");
+ first = false;
+ }
+ if (offload_flags.always_copy) {
+ sprintf(fbuffer+strlen(fbuffer),
+ first ? "always_copy" : ",always_copy");
+ first = false;
+ }
+ if (offload_flags.always_delete) {
+ sprintf(fbuffer+strlen(fbuffer),
+ first ? "always_delete" : ",always_delete");
+ first = false;
+ }
+ OFFLOAD_DEBUG_TRACE_1(1,
+ GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
+ "%s)\n", fbuffer);
+ }
+}
+
static char * offload_get_src_base(void * ptr, uint8_t type)
{
char *base;
@@ -204,7 +385,7 @@ static char * offload_get_src_base(void * ptr, uint8_t type)
else if (VAR_TYPE_IS_DV_DATA_SLICE(type) || VAR_TYPE_IS_DV_DATA(type)) {
ArrDesc *dvp;
if (VAR_TYPE_IS_DV_DATA_SLICE(type)) {
- const arr_desc *ap = static_cast<const arr_desc*>(ptr);
+ const Arr_Desc *ap = static_cast<const Arr_Desc*>(ptr);
dvp = (type == c_dv_data_slice) ?
reinterpret_cast<ArrDesc*>(ap->base) :
*reinterpret_cast<ArrDesc**>(ap->base);
@@ -278,130 +459,228 @@ _Offload_result OffloadDescriptor::translate_coi_error(COIRESULT res) const
}
}
+// is_targetptr == 0 && is_prealloc == 0 - allocation of pointer data;
+// is_targetptr == 1 && is_prealloc == 0 - allocation of target memory:
+//   allocate memory at target; use its value as base in the target table.
+// is_targetptr == 1 && is_prealloc == 1 - use preallocated target memory:
+//   base is the address of the preallocated memory at target; use its value
+//   as base in the target table.
+
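+// Illustrative call patterns (a sketch only; the argument names below are
+// hypothetical and are not taken from this patch):
+//   alloc_ptr_data(pd, p, disp, size, 0, align,
+//                  /*is_targptr=*/0, /*is_prealloc=*/0, /*pin=*/0);
+//       ordinary pointer data: host and MIC buffers get associated with p
+//   alloc_ptr_data(pd, p, disp, size, 0, align,
+//                  /*is_targptr=*/1, /*is_prealloc=*/0, /*pin=*/0);
+//       targetptr: memory is allocated at target and becomes the new base
+//   alloc_ptr_data(pd, target_addr, disp, size, 0, align,
+//                  /*is_targptr=*/1, /*is_prealloc=*/1, /*pin=*/0);
+//       preallocated targetptr: target_addr is an existing target address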
bool OffloadDescriptor::alloc_ptr_data(
PtrData* &ptr_data,
void *base,
int64_t disp,
int64_t size,
int64_t alloc_disp,
- int align
+ int align,
+ bool is_targptr,
+ bool is_prealloc,
+ bool pin
)
{
// total length of base
- int64_t length = disp + size;
+ int64_t length = size;
bool is_new;
+ COIBUFFER targptr_buf;
+ COIRESULT res;
+ uint32_t buffer_flags = 0;
+ char * base_disp = reinterpret_cast<char *>(base) + disp;
- OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
- base, length);
+ // create buffer with large pages if data length exceeds
+ // large page threshold
+ if (length >= __offload_use_2mb_buffers) {
+ buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
+ }
+    // Allocate memory at target for a targetptr that is not preallocated, as
+    // we need its address as the base argument in the call to
+    // m_device.insert_ptr_data
+ if (is_targptr && !is_prealloc) {
+ length = alloc_disp ? length : size + disp;
+ res = COI::BufferCreate(
+ length,
+ COI_BUFFER_NORMAL,
+ buffer_flags,
+ 0,
+ 1,
+ &m_device.get_process(),
+ &targptr_buf);
+ if (res != COI_SUCCESS) {
+ if (m_status != 0) {
+ m_status->result = translate_coi_error(res);
+ }
+ else if (m_is_mandatory) {
+ report_coi_error(c_buf_create, res);
+ }
+ return false;
+ }
+
+ res = COI::BufferGetSinkAddress(
+ targptr_buf, reinterpret_cast<uint64_t *>(&base));
+ if (res != COI_SUCCESS) {
+ if (m_status != 0) {
+ m_status->result = translate_coi_error(res);
+ }
+ else if (m_is_mandatory) {
+ report_coi_error(c_buf_get_address, res);
+ }
+ return false;
+ }
+ }
+ OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
+ alloc_disp ? base : base_disp,
+ alloc_disp ? length : size + disp);
+
// add new entry
- ptr_data = m_device.insert_ptr_data(base, length, is_new);
+
+ ptr_data = is_targptr ?
+ m_device.find_targetptr_data(base_disp) :
+ m_device.find_ptr_data(base_disp);
+    // if ptr_data is found we just need to check it for overlap
+ if (ptr_data) {
+ is_new = false;
+ base = base_disp;
+ }
+ else {
+ // If association is not found we must create it.
+ length = alloc_disp ? length : size + disp;
+ ptr_data = is_targptr ?
+ m_device.insert_targetptr_data(base, length, is_new) :
+ m_device.insert_ptr_data(base, length, is_new);
+ }
if (is_new) {
OFFLOAD_TRACE(3, "Added new association\n");
if (length > 0) {
OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
- COIRESULT res;
// align should be a power of 2
- if (align > 0 && (align & (align - 1)) == 0) {
+ if (!pin && !is_targptr &&
+ align > 0 && (align & (align - 1)) == 0) {
// offset within mic_buffer. Can do offset optimization
// only when source address alignment satisfies requested
// alignment on the target (cq172736).
if ((reinterpret_cast<intptr_t>(base) & (align - 1)) == 0) {
- ptr_data->mic_offset = reinterpret_cast<intptr_t>(base) & 4095;
+ ptr_data->mic_offset =
+ reinterpret_cast<intptr_t>(base) & 4095;
}
}
// buffer size and flags
uint64_t buffer_size = length + ptr_data->mic_offset;
- uint32_t buffer_flags = 0;
- // create buffer with large pages if data length exceeds
- // large page threshold
- if (length >= __offload_use_2mb_buffers) {
- buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
- }
-
- // create CPU buffer
- OFFLOAD_DEBUG_TRACE_1(3,
+ // For targetptr there is no CPU buffer
+ if (pin || !is_targptr) {
+ // create CPU buffer
+ OFFLOAD_DEBUG_TRACE_1(3,
GET_OFFLOAD_NUMBER(get_timer_data()),
c_offload_create_buf_host,
"Creating buffer from source memory %p, "
"length %lld\n", base, length);
- // result is not checked because we can continue without cpu
- // buffer. In this case we will use COIBufferRead/Write instead
- // of COIBufferCopy.
- COI::BufferCreateFromMemory(length,
+ // result is not checked because we can continue without cpu
+ // buffer. In this case we will use COIBufferRead/Write
+ // instead of COIBufferCopy.
+
+ COI::BufferCreateFromMemory(length,
COI_BUFFER_NORMAL,
0,
base,
1,
&m_device.get_process(),
&ptr_data->cpu_buf);
+ }
- OFFLOAD_DEBUG_TRACE_1(3,
+ // create MIC buffer
+ if (is_prealloc) {
+ OFFLOAD_DEBUG_TRACE_1(3,
GET_OFFLOAD_NUMBER(get_timer_data()),
c_offload_create_buf_mic,
- "Creating buffer for sink: size %lld, offset %d, "
- "flags =0x%x\n", buffer_size - alloc_disp,
+ "Creating buffer from sink memory: size %lld, offset %d, "
+ "flags =0x%x\n", buffer_size,
ptr_data->mic_offset, buffer_flags);
-
- // create MIC buffer
- res = COI::BufferCreate(buffer_size - alloc_disp,
- COI_BUFFER_NORMAL,
- buffer_flags,
- 0,
- 1,
- &m_device.get_process(),
- &ptr_data->mic_buf);
- if (res != COI_SUCCESS) {
- if (m_status != 0) {
- m_status->result = translate_coi_error(res);
- }
- else if (m_is_mandatory) {
- report_coi_error(c_buf_create, res);
+ res = COI::BufferCreateFromMemory(ptr_data->cpu_addr.length(),
+ COI_BUFFER_NORMAL,
+ COI_SINK_MEMORY,
+ base,
+ 1,
+ &m_device.get_process(),
+ &ptr_data->mic_buf);
+ if (res != COI_SUCCESS) {
+ if (m_status != 0) {
+ m_status->result = translate_coi_error(res);
+ }
+ else if (m_is_mandatory) {
+ report_coi_error(c_buf_create, res);
+ }
+ ptr_data->alloc_ptr_data_lock.unlock();
+ return false;
}
- ptr_data->alloc_ptr_data_lock.unlock();
- return false;
}
-
- // make buffer valid on the device.
- res = COI::BufferSetState(ptr_data->mic_buf,
- m_device.get_process(),
- COI_BUFFER_VALID,
- COI_BUFFER_NO_MOVE,
- 0, 0, 0);
- if (res != COI_SUCCESS) {
- if (m_status != 0) {
- m_status->result = translate_coi_error(res);
- }
- else if (m_is_mandatory) {
- report_coi_error(c_buf_set_state, res);
+ else if (is_targptr) {
+ ptr_data->mic_buf = targptr_buf;
+ }
+ else if (!pin) {
+ OFFLOAD_DEBUG_TRACE_1(3,
+ GET_OFFLOAD_NUMBER(get_timer_data()),
+ c_offload_create_buf_mic,
+ "Creating buffer for sink: size %lld, offset %d, "
+ "flags =0x%x\n", buffer_size,
+ ptr_data->mic_offset, buffer_flags);
+ res = COI::BufferCreate(buffer_size,
+ COI_BUFFER_NORMAL,
+ buffer_flags,
+ 0,
+ 1,
+ &m_device.get_process(),
+ &ptr_data->mic_buf);
+ if (res != COI_SUCCESS) {
+ if (m_status != 0) {
+ m_status->result = translate_coi_error(res);
+ }
+ else if (m_is_mandatory) {
+ report_coi_error(c_buf_create, res);
+ }
+ ptr_data->alloc_ptr_data_lock.unlock();
+ return false;
}
- ptr_data->alloc_ptr_data_lock.unlock();
- return false;
}
- res = COI::BufferSetState(ptr_data->mic_buf,
- COI_PROCESS_SOURCE,
- COI_BUFFER_INVALID,
- COI_BUFFER_NO_MOVE,
- 0, 0, 0);
- if (res != COI_SUCCESS) {
- if (m_status != 0) {
- m_status->result = translate_coi_error(res);
+ if (!pin) {
+ // make buffer valid on the device.
+ res = COI::BufferSetState(ptr_data->mic_buf,
+ m_device.get_process(),
+ COI_BUFFER_VALID,
+ COI_BUFFER_NO_MOVE,
+ 0, 0, 0);
+ if (res != COI_SUCCESS) {
+ if (m_status != 0) {
+ m_status->result = translate_coi_error(res);
+ }
+ else if (m_is_mandatory) {
+ report_coi_error(c_buf_set_state, res);
+ }
+ ptr_data->alloc_ptr_data_lock.unlock();
+ return false;
}
- else if (m_is_mandatory) {
- report_coi_error(c_buf_set_state, res);
+
+ res = COI::BufferSetState(ptr_data->mic_buf,
+ COI_PROCESS_SOURCE,
+ COI_BUFFER_INVALID,
+ COI_BUFFER_NO_MOVE,
+ 0, 0, 0);
+ if (res != COI_SUCCESS) {
+ if (m_status != 0) {
+ m_status->result = translate_coi_error(res);
+ }
+ else if (m_is_mandatory) {
+ report_coi_error(c_buf_set_state, res);
+ }
+ ptr_data->alloc_ptr_data_lock.unlock();
+ return false;
}
- ptr_data->alloc_ptr_data_lock.unlock();
- return false;
}
}
-
ptr_data->alloc_disp = alloc_disp;
ptr_data->alloc_ptr_data_lock.unlock();
}
@@ -415,9 +694,11 @@ bool OffloadDescriptor::alloc_ptr_data(
// This is not a new entry. Make sure that provided address range fits
// into existing one.
- MemRange addr_range(base, length - ptr_data->alloc_disp);
+ MemRange addr_range(base, length);
if (!ptr_data->cpu_addr.contains(addr_range)) {
- LIBOFFLOAD_ERROR(c_bad_ptr_mem_range);
+ LIBOFFLOAD_ERROR(c_bad_ptr_mem_alloc, base, length,
+ const_cast<void *>(ptr_data->cpu_addr.start()),
+ ptr_data->cpu_addr.length());
exit(1);
}
@@ -433,20 +714,24 @@ bool OffloadDescriptor::alloc_ptr_data(
bool OffloadDescriptor::find_ptr_data(
PtrData* &ptr_data,
- void *base,
+ void *in_base,
int64_t disp,
int64_t size,
+ bool is_targetptr,
bool report_error
)
{
// total length of base
- int64_t length = disp + size;
-
+ int64_t length = size;
+ char *base = reinterpret_cast<char *>(in_base) + disp;
+
OFFLOAD_TRACE(3, "Looking for association for data: addr %p, "
"length %lld\n", base, length);
// find existing association in pointer table
- ptr_data = m_device.find_ptr_data(base);
+ ptr_data = is_targetptr ?
+ m_device.find_targetptr_data(base) :
+ m_device.find_ptr_data(base);
if (ptr_data == 0) {
if (report_error) {
LIBOFFLOAD_ERROR(c_no_ptr_data, base);
@@ -464,7 +749,9 @@ bool OffloadDescriptor::find_ptr_data(
MemRange addr_range(base, length);
if (!ptr_data->cpu_addr.contains(addr_range)) {
if (report_error) {
- LIBOFFLOAD_ERROR(c_bad_ptr_mem_range);
+ LIBOFFLOAD_ERROR(c_bad_ptr_mem_range, base, length,
+ const_cast<void *>(ptr_data->cpu_addr.start()),
+ ptr_data->cpu_addr.length());
exit(1);
}
OFFLOAD_TRACE(3, "Existing association partially overlaps with "
@@ -591,6 +878,7 @@ bool OffloadDescriptor::offload_stack_memory_manager(
PersistDataList::iterator it_begin = m_device.m_persist_list.begin();
PersistDataList::iterator it_end;
int erase = 0;
+ uint64_t cur_thread_id = m_device.get_thread_id();
*is_new = false;
@@ -600,9 +888,11 @@ bool OffloadDescriptor::offload_stack_memory_manager(
if (stack_begin > it->stack_cpu_addr) {
// this stack data must be destroyed
- m_destroy_stack.push_front(cur_el.stack_ptr_data);
- it_end = it;
- erase++;
+ if (cur_thread_id == cur_el.thread_id) {
+ m_destroy_stack.push_front(cur_el.stack_ptr_data);
+ it_end = it;
+ erase++;
+ }
}
else if (stack_begin == it->stack_cpu_addr) {
if (routine_id != it-> routine_id) {
@@ -627,7 +917,8 @@ bool OffloadDescriptor::offload_stack_memory_manager(
return true;
}
}
- else if (stack_begin < it->stack_cpu_addr) {
+ else if (stack_begin < it->stack_cpu_addr &&
+ cur_thread_id == cur_el.thread_id) {
break;
}
}
@@ -638,7 +929,7 @@ bool OffloadDescriptor::offload_stack_memory_manager(
m_in_datalen += erase * sizeof(new_el->stack_ptr_data->mic_addr);
}
// new stack table is created
- new_el = new PersistData(stack_begin, routine_id, buf_size);
+ new_el = new PersistData(stack_begin, routine_id, buf_size, cur_thread_id);
// create MIC buffer
COIRESULT res;
uint32_t buffer_flags = 0;
@@ -733,11 +1024,13 @@ bool OffloadDescriptor::setup_descriptors(
}
// dependencies
- m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * (m_vars_total + 1));
+ m_in_deps_allocated = m_vars_total + 1;
+ m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_in_deps_allocated);
if (m_in_deps == NULL)
LIBOFFLOAD_ERROR(c_malloc);
if (m_vars_total > 0) {
- m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_vars_total);
+ m_out_deps_allocated = m_vars_total;
+ m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_out_deps_allocated);
if (m_out_deps == NULL)
LIBOFFLOAD_ERROR(c_malloc);
}
@@ -752,7 +1045,7 @@ bool OffloadDescriptor::setup_descriptors(
for (int i = 0; i < m_vars_total; i++) {
void* alloc_base = NULL;
int64_t alloc_disp = 0;
- int64_t alloc_size;
+ int64_t alloc_size = 0;
bool src_is_for_mic = (m_vars[i].direction.out ||
m_vars[i].into == NULL);
@@ -787,25 +1080,41 @@ bool OffloadDescriptor::setup_descriptors(
m_vars[i].count,
m_vars[i].ptr,
m_vars[i].into);
+ // If any varDesc flags bits set, show them
+ if (console_enabled >= 1 && m_vars[i].flags.bits != 0) {
+ trace_varDesc_flags(get_timer_data(), m_vars[i].flags);
+ }
+ // preallocated implies targetptr
+ if (m_vars[i].flags.preallocated) {
+ // targetptr preallocated alloc_if(1) may not be used with
+ // an in clause
+ if (m_vars[i].direction.in && m_vars[i].alloc_if) {
+ LIBOFFLOAD_ERROR(c_in_with_preallocated);
+ exit(1);
+ }
+ m_vars[i].flags.targetptr = 1;
+ }
if (m_vars[i].alloc != NULL) {
// array descriptor
- const arr_desc *ap =
- static_cast<const arr_desc*>(m_vars[i].alloc);
+ const Arr_Desc *ap =
+ static_cast<const Arr_Desc*>(m_vars[i].alloc);
// debug dump
- __arr_desc_dump(" ", "ALLOC", ap, 0);
+ ARRAY_DESC_DUMP(" ", "ALLOC", ap, 0, 1);
__arr_data_offset_and_length(ap, alloc_disp, alloc_size);
alloc_base = reinterpret_cast<void*>(ap->base);
}
+ m_vars_extra[i].alloc = m_vars[i].alloc;
m_vars_extra[i].cpu_disp = 0;
m_vars_extra[i].cpu_offset = 0;
m_vars_extra[i].src_data = 0;
m_vars_extra[i].read_rng_src = 0;
m_vars_extra[i].read_rng_dst = 0;
+ m_vars_extra[i].omp_last_event_type = c_last_not;
// flag is_arr_ptr_el is 1 only for var_descs generated
// for c_data_ptr_array type
if (i < vars_total) {
@@ -815,7 +1124,7 @@ bool OffloadDescriptor::setup_descriptors(
switch (m_vars[i].type.src) {
case c_data_ptr_array:
{
- const arr_desc *ap;
+ const Arr_Desc *ap;
const VarDesc3 *vd3 =
static_cast<const VarDesc3*>(m_vars[i].ptr);
int flags = vd3->array_fields;
@@ -824,32 +1133,33 @@ bool OffloadDescriptor::setup_descriptors(
OFFLOAD_TRACE(2,
" pointer array type is %s\n",
vardesc_type_as_string[flags & 0x3f]);
- ap = static_cast<const arr_desc*>(vd3->ptr_array);
- __arr_desc_dump(" ", "ptr array", ap, 0);
+ ap = static_cast<const Arr_Desc*>(vd3->ptr_array);
+ ARRAY_DESC_DUMP(" ", "ptr array", ap,
+ m_vars[i].flags.is_pointer, 1);
if (m_vars[i].into) {
- ap = static_cast<const arr_desc*>(m_vars[i].into);
- __arr_desc_dump(
- " ", "into array", ap, 0);
+ ap = static_cast<const Arr_Desc*>(m_vars[i].into);
+ ARRAY_DESC_DUMP(
+ " ", "into array", ap, 0, 1);
}
if ((flags & (1<<flag_align_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->align_array);
- __arr_desc_dump(
- " ", "align array", ap, 0);
+ ap = static_cast<const Arr_Desc*>(vd3->align_array);
+ ARRAY_DESC_DUMP(
+ " ", "align array", ap, 0, 1);
}
if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->alloc_if_array);
- __arr_desc_dump(
- " ", "alloc_if array", ap, 0);
+ ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array);
+ ARRAY_DESC_DUMP(
+ " ", "alloc_if array", ap, 0, 1);
}
if ((flags & (1<<flag_free_if_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->free_if_array);
- __arr_desc_dump(
- " ", "free_if array", ap, 0);
+ ap = static_cast<const Arr_Desc*>(vd3->free_if_array);
+ ARRAY_DESC_DUMP(
+ " ", "free_if array", ap, 0, 1);
}
if ((flags & (1<<flag_extent_start_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->extent_start);
- __arr_desc_dump(
- " ", "extent_start array", ap, 0);
+ ap = static_cast<const Arr_Desc*>(vd3->extent_start);
+ ARRAY_DESC_DUMP(
+ " ", "extent_start array", ap, 0, 1);
} else if ((flags &
(1<<flag_extent_start_is_scalar)) != 0) {
OFFLOAD_TRACE(2,
@@ -857,10 +1167,10 @@ bool OffloadDescriptor::setup_descriptors(
(int64_t)vd3->extent_start);
}
if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
- ap = static_cast<const arr_desc*>
+ ap = static_cast<const Arr_Desc*>
(vd3->extent_elements);
- __arr_desc_dump(
- " ", "extent_elements array", ap, 0);
+ ARRAY_DESC_DUMP(" ",
+ "extent_elements array", ap, 0, 1);
} else if ((flags &
(1<<flag_extent_elements_is_scalar)) != 0) {
OFFLOAD_TRACE(2,
@@ -868,9 +1178,9 @@ bool OffloadDescriptor::setup_descriptors(
(int64_t)vd3->extent_elements);
}
if ((flags & (1<<flag_into_start_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->into_start);
- __arr_desc_dump(
- " ", "into_start array", ap, 0);
+ ap = static_cast<const Arr_Desc*>(vd3->into_start);
+ ARRAY_DESC_DUMP(
+ " ", "into_start array", ap, 0, 1);
} else if ((flags &
(1<<flag_into_start_is_scalar)) != 0) {
OFFLOAD_TRACE(2,
@@ -878,9 +1188,9 @@ bool OffloadDescriptor::setup_descriptors(
(int64_t)vd3->into_start);
}
if ((flags & (1<<flag_into_elements_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->into_elements);
- __arr_desc_dump(
- " ", "into_elements array", ap, 0);
+ ap = static_cast<const Arr_Desc*>(vd3->into_elements);
+ ARRAY_DESC_DUMP(
+ " ", "into_elements array", ap, 0, 1);
} else if ((flags &
(1<<flag_into_elements_is_scalar)) != 0) {
OFFLOAD_TRACE(2,
@@ -888,9 +1198,9 @@ bool OffloadDescriptor::setup_descriptors(
(int64_t)vd3->into_elements);
}
if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->alloc_start);
- __arr_desc_dump(
- " ", "alloc_start array", ap, 0);
+ ap = static_cast<const Arr_Desc*>(vd3->alloc_start);
+ ARRAY_DESC_DUMP(
+ " ", "alloc_start array", ap, 0, 1);
} else if ((flags &
(1<<flag_alloc_start_is_scalar)) != 0) {
OFFLOAD_TRACE(2,
@@ -898,9 +1208,9 @@ bool OffloadDescriptor::setup_descriptors(
(int64_t)vd3->alloc_start);
}
if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->alloc_elements);
- __arr_desc_dump(
- " ", "alloc_elements array", ap, 0);
+ ap = static_cast<const Arr_Desc*>(vd3->alloc_elements);
+ ARRAY_DESC_DUMP(" ",
+ "alloc_elements array", ap, 0, 1);
} else if ((flags &
(1<<flag_alloc_elements_is_scalar)) != 0) {
OFFLOAD_TRACE(2,
@@ -922,11 +1232,11 @@ bool OffloadDescriptor::setup_descriptors(
// VarDesc.disp will have an offset from base
if (m_vars[i].type.src == c_cean_var) {
// array descriptor
- const arr_desc *ap =
- static_cast<const arr_desc*>(m_vars[i].ptr);
+ const Arr_Desc *ap =
+ static_cast<const Arr_Desc*>(m_vars[i].ptr);
// debug dump
- __arr_desc_dump("", "IN/OUT", ap, 0);
+ ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);
// offset and length are derived from the array descriptor
__arr_data_offset_and_length(ap, m_vars[i].disp,
@@ -961,7 +1271,7 @@ bool OffloadDescriptor::setup_descriptors(
m_vars[i].ptr,
m_vars[i].disp,
m_vars[i].size,
- false)) {
+ false, false)) {
return false;
}
@@ -983,10 +1293,11 @@ bool OffloadDescriptor::setup_descriptors(
if (m_is_openmp) {
if (m_vars[i].flags.is_static) {
- // Static data is transferred only by omp target
+ // Static data is transferred either by omp target
// update construct which passes zeros for
- // alloc_if and free_if.
- if (m_vars[i].alloc_if || m_vars[i].free_if) {
+ // alloc_if and free_if or by always modifier.
+ if (!m_vars[i].flags.always_copy &&
+ (m_vars[i].alloc_if || m_vars[i].free_if)) {
m_vars[i].direction.bits = c_parameter_nocopy;
}
}
@@ -1004,10 +1315,12 @@ bool OffloadDescriptor::setup_descriptors(
m_vars[i].ptr);
}
- // For automatic variables data is transferred
- // only if alloc_if == 0 && free_if == 0
- // or reference count is 1
- if ((m_vars[i].alloc_if || m_vars[i].free_if) &&
+ // For automatic variables data is transferred:
+ // - if always modifier is used OR
+ // - if alloc_if == 0 && free_if == 0 OR
+ // - if reference count is 1
+ if (!m_vars[i].flags.always_copy &&
+ (m_vars[i].alloc_if || m_vars[i].free_if) &&
auto_data != 0 &&
auto_data->get_reference() != 1) {
m_vars[i].direction.bits = c_parameter_nocopy;
@@ -1088,8 +1401,12 @@ bool OffloadDescriptor::setup_descriptors(
}
m_vars[i].size = m_destroy_stack.size();
m_vars_extra[i].src_data = m_stack_ptr_data;
- // need to add reference for buffer
- m_need_runfunction = true;
+
+ // need to add or remove references for stack buffer at target
+ if (is_new || m_destroy_stack.size()) {
+ m_need_runfunction = true;
+ }
+
break;
}
/* fallthru */
@@ -1098,11 +1415,11 @@ bool OffloadDescriptor::setup_descriptors(
case c_dv_ptr:
if (m_vars[i].type.src == c_cean_var_ptr) {
// array descriptor
- const arr_desc *ap =
- static_cast<const arr_desc*>(m_vars[i].ptr);
+ const Arr_Desc *ap =
+ static_cast<const Arr_Desc*>(m_vars[i].ptr);
// debug dump
- __arr_desc_dump("", "IN/OUT", ap, 1);
+ ARRAY_DESC_DUMP("", "IN/OUT", ap, 1, !src_is_for_mic);
// offset and length are derived from the array descriptor
__arr_data_offset_and_length(ap, m_vars[i].disp,
@@ -1145,9 +1462,10 @@ bool OffloadDescriptor::setup_descriptors(
m_vars[i].free_if) {
PtrData *ptr_data;
- // check that buffer length >= 0
+ // check that buffer length > 0
if (m_vars[i].alloc_if &&
- m_vars[i].disp + m_vars[i].size < 0) {
+ m_vars[i].disp + m_vars[i].size <
+ (m_is_openmp ? 0 : 1)) {
LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
exit(1);
}
@@ -1166,20 +1484,34 @@ bool OffloadDescriptor::setup_descriptors(
m_vars[i].flags.sink_addr = 1;
}
else if (m_vars[i].alloc_if) {
+ if (m_vars[i].flags.preallocated) {
+ m_out_datalen += sizeof(void*);
+ m_need_runfunction = true;
+ break;
+ }
// add new entry
if (!alloc_ptr_data(
ptr_data,
- base,
+ reinterpret_cast<char *>(base) + alloc_disp,
(alloc_base != NULL) ?
alloc_disp : m_vars[i].disp,
(alloc_base != NULL) ?
alloc_size : m_vars[i].size,
alloc_disp,
(alloc_base != NULL) ?
- 0 : m_vars[i].align)) {
+ 0 : m_vars[i].align,
+ m_vars[i].flags.targetptr,
+ 0,
+ m_vars[i].flags.pin)) {
return false;
}
-
+ if (m_vars[i].flags.targetptr) {
+ if (!init_mic_address(ptr_data)) {
+ return false;
+ }
+ *static_cast<void**>(m_vars[i].ptr) = base =
+ reinterpret_cast<void*>(ptr_data->mic_addr);
+ }
if (ptr_data->add_reference() == 0 &&
ptr_data->mic_buf != 0) {
// add buffer to the list of buffers that
@@ -1187,12 +1519,14 @@ bool OffloadDescriptor::setup_descriptors(
m_compute_buffers.push_back(
ptr_data->mic_buf);
}
- else {
+ else if (!m_vars[i].flags.pin &&
+ !m_vars[i].flags.preallocated) {
// will send buffer address to device
m_vars[i].flags.sink_addr = 1;
}
- if (!ptr_data->is_static) {
+ if (!m_vars[i].flags.pin &&
+ !ptr_data->is_static) {
// need to add reference for buffer
m_need_runfunction = true;
}
@@ -1202,8 +1536,9 @@ bool OffloadDescriptor::setup_descriptors(
if (m_is_openmp) {
// For omp target update variable is ignored
// if it does not exist.
- if (!m_vars[i].alloc_if &&
- !m_vars[i].free_if) {
+ if (m_vars[i].flags.always_copy ||
+ (!m_vars[i].alloc_if &&
+ !m_vars[i].free_if)) {
error_if_not_found = false;
}
}
@@ -1213,6 +1548,7 @@ bool OffloadDescriptor::setup_descriptors(
base,
m_vars[i].disp,
m_vars[i].size,
+ m_vars[i].flags.targetptr,
error_if_not_found)) {
return false;
}
@@ -1235,9 +1571,10 @@ bool OffloadDescriptor::setup_descriptors(
// data is transferred only if
// alloc_if == 0 && free_if == 0
// or reference count is 1
- if ((m_vars[i].alloc_if ||
- m_vars[i].free_if) &&
- ptr_data->get_reference() != 1) {
+ if (!m_vars[i].flags.always_copy &&
+ ((m_vars[i].alloc_if ||
+ m_vars[i].free_if) &&
+ ptr_data->get_reference() != 1)) {
m_vars[i].direction.bits =
c_parameter_nocopy;
}
@@ -1257,7 +1594,8 @@ bool OffloadDescriptor::setup_descriptors(
m_in_datalen += sizeof(ptr_data->mic_addr);
}
- if (!ptr_data->is_static && m_vars[i].free_if) {
+ if (!m_vars[i].flags.pin &&
+ !ptr_data->is_static && m_vars[i].free_if) {
// need to decrement buffer reference on target
m_need_runfunction = true;
}
@@ -1277,7 +1615,7 @@ bool OffloadDescriptor::setup_descriptors(
base,
m_vars[i].disp,
m_vars[i].size,
- false)) {
+ false, false)) {
return false;
}
if (ptr_data) {
@@ -1308,8 +1646,8 @@ bool OffloadDescriptor::setup_descriptors(
case c_dv_ptr_data_slice:
ArrDesc *dvp;
if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
- const arr_desc *ap;
- ap = static_cast<const arr_desc*>(m_vars[i].ptr);
+ const Arr_Desc *ap;
+ ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);
dvp = (m_vars[i].type.src == c_dv_data_slice) ?
reinterpret_cast<ArrDesc*>(ap->base) :
@@ -1331,13 +1669,13 @@ bool OffloadDescriptor::setup_descriptors(
if (m_vars[i].direction.bits ||
m_vars[i].alloc_if ||
m_vars[i].free_if) {
- const arr_desc *ap;
+ const Arr_Desc *ap;
if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
- ap = static_cast<const arr_desc*>(m_vars[i].ptr);
+ ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);
// debug dump
- __arr_desc_dump("", "IN/OUT", ap, 0);
+ ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);
}
if (!__dv_is_contiguous(dvp)) {
m_vars[i].flags.is_noncont_src = 1;
@@ -1393,14 +1731,17 @@ bool OffloadDescriptor::setup_descriptors(
// add new entry
if (!alloc_ptr_data(
ptr_data,
- base,
+ reinterpret_cast<char *>(base) + alloc_disp,
(alloc_base != NULL) ?
alloc_disp : m_vars[i].disp,
(alloc_base != NULL) ?
alloc_size : m_vars[i].size,
alloc_disp,
(alloc_base != NULL) ?
- 0 : m_vars[i].align)) {
+ 0 : m_vars[i].align,
+ m_vars[i].flags.targetptr,
+ m_vars[i].flags.preallocated,
+ m_vars[i].flags.pin)) {
return false;
}
@@ -1426,8 +1767,9 @@ bool OffloadDescriptor::setup_descriptors(
if (m_is_openmp) {
// For omp target update variable is ignored
// if it does not exist.
- if (!m_vars[i].alloc_if &&
- !m_vars[i].free_if) {
+ if (m_vars[i].flags.always_copy ||
+ (!m_vars[i].alloc_if &&
+ !m_vars[i].free_if)) {
error_if_not_found = false;
}
}
@@ -1437,6 +1779,7 @@ bool OffloadDescriptor::setup_descriptors(
base,
m_vars[i].disp,
m_vars[i].size,
+ m_vars[i].flags.targetptr,
error_if_not_found)) {
return false;
}
@@ -1457,10 +1800,12 @@ bool OffloadDescriptor::setup_descriptors(
if (ptr_data != 0) {
if (m_is_openmp) {
- // data is transferred only if
- // alloc_if == 0 && free_if == 0
- // or reference count is 1
- if ((m_vars[i].alloc_if ||
+ // data is transferred if
+ // - if always modifier is used OR
+ // - if alloc_if == 0 && free_if == 0 OR
+ // - if reference count is 1
+ if (!m_vars[i].flags.always_copy &&
+ (m_vars[i].alloc_if ||
m_vars[i].free_if) &&
ptr_data->get_reference() != 1) {
m_vars[i].direction.bits =
@@ -1503,7 +1848,7 @@ bool OffloadDescriptor::setup_descriptors(
base,
m_vars[i].disp,
m_vars[i].size,
- false)) {
+ false, false)) {
return false;
}
m_vars[i].offset = !ptr_data ? 0 :
@@ -1551,11 +1896,11 @@ bool OffloadDescriptor::setup_descriptors(
if (m_vars[i].type.dst == c_cean_var) {
// array descriptor
- const arr_desc *ap =
- static_cast<const arr_desc*>(m_vars[i].into);
+ const Arr_Desc *ap =
+ static_cast<const Arr_Desc*>(m_vars[i].into);
// debug dump
- __arr_desc_dump(" ", "INTO", ap, 0);
+ ARRAY_DESC_DUMP(" ", "INTO", ap, 0, src_is_for_mic);
// offset and length are derived from the array descriptor
__arr_data_offset_and_length(ap, into_disp, size);
@@ -1594,7 +1939,7 @@ bool OffloadDescriptor::setup_descriptors(
// find data associated with variable
if (!find_ptr_data(ptr_data, m_vars[i].into,
- into_disp, size, false)) {
+ into_disp, size, false, false)) {
return false;
}
if (ptr_data != 0) {
@@ -1648,11 +1993,11 @@ bool OffloadDescriptor::setup_descriptors(
if (m_vars[i].type.dst == c_cean_var_ptr) {
// array descriptor
- const arr_desc *ap =
- static_cast<const arr_desc*>(m_vars[i].into);
+ const Arr_Desc *ap =
+ static_cast<const Arr_Desc*>(m_vars[i].into);
// debug dump
- __arr_desc_dump(" ", "INTO", ap, 1);
+ ARRAY_DESC_DUMP(" ", "INTO", ap, 1, src_is_for_mic);
// offset and length are derived from the array descriptor
__arr_data_offset_and_length(ap, into_disp, size);
@@ -1713,20 +2058,34 @@ bool OffloadDescriptor::setup_descriptors(
m_vars[i].flags.sink_addr = 1;
}
else if (m_vars[i].alloc_if) {
+ if (m_vars[i].flags.preallocated) {
+ m_out_datalen += sizeof(void*);
+ m_need_runfunction = true;
+ break;
+ }
// add new entry
if (!alloc_ptr_data(
ptr_data,
- base,
+ reinterpret_cast<char *>(base) + alloc_disp,
(alloc_base != NULL) ?
alloc_disp : into_disp,
(alloc_base != NULL) ?
alloc_size : size,
alloc_disp,
(alloc_base != NULL) ?
- 0 : m_vars[i].align)) {
+ 0 : m_vars[i].align,
+ m_vars[i].flags.targetptr,
+ m_vars[i].flags.preallocated,
+ m_vars[i].flags.pin)) {
return false;
}
-
+ if (m_vars[i].flags.targetptr) {
+ if (!init_mic_address(ptr_data)) {
+ return false;
+ }
+ *static_cast<void**>(m_vars[i].into) = base =
+ reinterpret_cast<void*>(ptr_data->mic_addr);
+ }
if (ptr_data->add_reference() == 0 &&
ptr_data->mic_buf != 0) {
// add buffer to the list of buffers that
@@ -1746,7 +2105,8 @@ bool OffloadDescriptor::setup_descriptors(
}
else {
// use existing association from pointer table
- if (!find_ptr_data(ptr_data, base, into_disp, size)) {
+ if (!find_ptr_data(ptr_data, base, into_disp,
+ size, m_vars[i].flags.targetptr, true)) {
return false;
}
m_vars[i].flags.sink_addr = 1;
@@ -1780,7 +2140,7 @@ bool OffloadDescriptor::setup_descriptors(
base,
into_disp,
m_vars[i].size,
- false)) {
+ false, false)) {
return false;
}
}
@@ -1806,17 +2166,17 @@ bool OffloadDescriptor::setup_descriptors(
if (m_vars[i].direction.bits ||
m_vars[i].alloc_if ||
m_vars[i].free_if) {
- const arr_desc *ap;
+ const Arr_Desc *ap;
ArrDesc *dvp;
PtrData *ptr_data;
int64_t disp;
int64_t size;
if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) {
- ap = static_cast<const arr_desc*>(m_vars[i].into);
+ ap = static_cast<const Arr_Desc*>(m_vars[i].into);
// debug dump
- __arr_desc_dump(" ", "INTO", ap, 0);
+ ARRAY_DESC_DUMP(" ", "INTO", ap, 0, src_is_for_mic);
dvp = (m_vars[i].type.dst == c_dv_data_slice) ?
reinterpret_cast<ArrDesc*>(ap->base) :
@@ -1889,14 +2249,17 @@ bool OffloadDescriptor::setup_descriptors(
// add new entry
if (!alloc_ptr_data(
ptr_data,
- base,
+ reinterpret_cast<char *>(base) + alloc_disp,
(alloc_base != NULL) ?
alloc_disp : into_disp,
(alloc_base != NULL) ?
alloc_size : size,
alloc_disp,
(alloc_base != NULL) ?
- 0 : m_vars[i].align)) {
+ 0 : m_vars[i].align,
+ m_vars[i].flags.targetptr,
+ m_vars[i].flags.preallocated,
+ m_vars[i].flags.pin)) {
return false;
}
if (ptr_data->add_reference() == 0 &&
@@ -1918,7 +2281,8 @@ bool OffloadDescriptor::setup_descriptors(
}
else {
// use existing association from pointer table
- if (!find_ptr_data(ptr_data, base, into_disp, size)) {
+ if (!find_ptr_data(ptr_data, base, into_disp,
+ size, m_vars[i].flags.targetptr, true)) {
return false;
}
@@ -1958,7 +2322,7 @@ bool OffloadDescriptor::setup_descriptors(
base,
into_disp,
size,
- false)) {
+ false, false)) {
return false;
}
into_offset = !ptr_data ?
@@ -2062,9 +2426,10 @@ bool OffloadDescriptor::setup_misc_data(const char *name)
if (m_func_desc == NULL)
LIBOFFLOAD_ERROR(c_malloc);
m_func_desc->console_enabled = console_enabled;
- m_func_desc->timer_enabled =
- timer_enabled || (offload_report_level && offload_report_enabled);
- m_func_desc->offload_report_level = offload_report_level;
+ m_func_desc->timer_enabled = offload_report_enabled &&
+ (timer_enabled || offload_report_level);
+ m_func_desc->offload_report_level = offload_report_enabled ?
+ offload_report_level : 0;
m_func_desc->offload_number = GET_OFFLOAD_NUMBER(get_timer_data());
m_func_desc->in_datalen = m_in_datalen;
m_func_desc->out_datalen = m_out_datalen;
@@ -2078,35 +2443,193 @@ bool OffloadDescriptor::setup_misc_data(const char *name)
return true;
}
+void OffloadDescriptor::setup_omp_async_info()
+{
+ OFFLOAD_TRACE(2, "setup_omp_async_info\n");
+ OmpAsyncLastEventType event_type = m_need_runfunction ?
+ c_last_runfunc : c_last_write;
+ int last_in = m_need_runfunction ? 0 : -1;
+ int i;
+
+ for (i = m_vars_total - 1; i >=0; i--) {
+ switch (m_vars[i].type.dst) {
+ case c_data:
+ case c_void_ptr:
+ case c_cean_var:
+ if (m_vars[i].direction.out &&
+ m_vars[i].flags.is_static_dstn) {
+ event_type = c_last_read;
+ }
+ else if (last_in < 0 && m_vars[i].direction.in &&
+ m_vars[i].flags.is_static_dstn) {
+ last_in = i;
+ }
+ break;
+ case c_string_ptr:
+ case c_data_ptr:
+ case c_cean_var_ptr:
+ case c_dv_ptr:
+ case c_dv_data:
+ case c_dv_ptr_data:
+ case c_dv_data_slice:
+ case c_dv_ptr_data_slice:
+
+ if (m_vars[i].direction.out) {
+ event_type = c_last_read;
+ }
+ else if (last_in < 0 && m_vars[i].direction.in) {
+ last_in = i;
+ }
+ break;
+ default:
+ break;
+ }
+ if (event_type == c_last_read) {
+ break;
+ }
+ }
+
+ if (event_type == c_last_read) {
+ m_vars_extra[i].omp_last_event_type = c_last_read;
+ }
+ else if (event_type == c_last_write) {
+ m_vars_extra[last_in].omp_last_event_type = c_last_write;
+ }
+ m_omp_async_last_event_type = event_type;
+ OFFLOAD_TRACE(2, "setup_omp_async_info: event_type=%d\n",
+ m_omp_async_last_event_type);
+}
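+
+// Illustrative walk-through of setup_omp_async_info above (the variable list
+// is hypothetical, not taken from this patch): for vars {in: static A,
+// in: ptr B, out: ptr C} with no runfunction needed, the backward scan meets
+// C first; its direction.out sets event_type = c_last_read and the scan
+// stops, so C carries the last OMP async event. With no out transfers the
+// last in variable gets a c_last_write event instead, and when a runfunction
+// is needed (and nothing is transferred out) the type stays c_last_runfunc.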
+
+extern "C" {
+ void offload_proxy_task_completed_ooo(
+ COIEVENT e,
+ const COIRESULT r,
+ const void *info
+ )
+ {
+ /* TODO: Call callback function, pass info. */
+ }
+}
+
+void OffloadDescriptor::register_omp_event_call_back(
+ const COIEVENT *event,
+ const void *info)
+{
+ OFFLOAD_TRACE(2, "register_omp_event_call_back(event=%p, info=%p)\n",
+ event, info);
+ if (COI::EventRegisterCallback) {
+ COI::EventRegisterCallback(
+ *event,
+ &offload_proxy_task_completed_ooo,
+ info, 0);
+ OFFLOAD_TRACE(2,
+ "COI::EventRegisterCallback found; callback registered\n");
+ }
+}
+
bool OffloadDescriptor::wait_dependencies(
- const void **waits,
- int num_waits
+ const void **waits,
+ int num_waits,
+ _Offload_stream handle
)
{
OffloadTimer timer(get_timer_data(), c_offload_host_wait_deps);
bool ret = true;
+ OffloadDescriptor *task;
+ if (num_waits == 0) {
+ return true;
+ }
- for (int i = 0; i < num_waits; i++) {
+ // wait for streams
+ if (num_waits == -1) {
+ Stream * stream;
+ // some specific stream of the device
+ if (handle != 0) {
+ stream = Stream::find_stream(handle, false);
- OffloadDescriptor *task = m_device.find_signal(waits[i], true);
- if (task == 0) {
- LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
- waits[i]);
- LIBOFFLOAD_ABORT;
- }
+ // the stream was not created or was destroyed
+ if (!stream) {
+ LIBOFFLOAD_ERROR(c_offload_no_stream, m_device.get_logical_index());
+ LIBOFFLOAD_ABORT;
+ }
+ task = stream->get_last_offload();
- if (!task->offload_finish()) {
- ret = false;
+ // offload was completed by previous offload_wait pragma
+ // or wait clause
+ if (task == 0) {
+ return true;
+ }
+ if (!task->offload_finish(0)) { //arg is 0 for is_traceback
+ ret = false;
+ }
+ task->cleanup();
+ stream->set_last_offload(NULL);
+ delete task;
}
+ // all streams of the device or over all devices
+ else {
+ StreamMap stream_map = Stream::all_streams;
+ for (StreamMap::iterator it = stream_map.begin();
+ it != stream_map.end(); it++) {
+ Stream * stream = it->second;
- task->cleanup();
- delete task;
- }
+ if (!m_wait_all_devices &&
+ stream->get_device() != m_device.get_logical_index()) {
+ continue;
+ }
+ // get associated async task
+ OffloadDescriptor *task = stream->get_last_offload();
+ // offload was completed by offload_wait pragma or wait clause
+ if (task == 0) {
+ continue;
+ }
+ if (!task->offload_finish(0)) { //arg is 0 for is_traceback
+ ret = false;
+ }
+ task->cleanup();
+ stream->set_last_offload(NULL);
+ delete task;
+ }
+ // no uncompleted streams
+ return true;
+ }
+ }
+ else {
+        // if handle is equal to no_stream it is a wait for signals
+ for (int i = 0; i < num_waits; i++) {
+ _Offload_stream stream_handle;
+ Stream *stream;
+ task = m_device.find_signal(waits[i], true);
+ if (task == 0) {
+ LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
+ waits[i]);
+ LIBOFFLOAD_ABORT;
+ }
+ else if (task == SIGNAL_IS_REMOVED) {
+ continue;
+ }
+ if (!task->offload_finish(0)) { //arg is 0 for is_traceback
+ ret = false;
+ }
+ task->cleanup();
+            // if the offload both has a signal and is the last offload of its
+            // stream, we must wipe out the "last_offload" reference as
+            // the offload is already finished.
+ stream_handle = task->m_stream;
+ if (stream_handle != -1) {
+ stream = Stream::find_stream(stream_handle, false);
+ if (stream && stream->get_last_offload() == task) {
+ stream->set_last_offload(NULL);
+ }
+ }
+ delete task;
+ }
+ }
return ret;
}
-bool OffloadDescriptor::offload(
+bool OffloadDescriptor::offload_wrap(
const char *name,
bool is_empty,
VarDesc *vars,
@@ -2116,19 +2639,73 @@ bool OffloadDescriptor::offload(
int num_waits,
const void **signal,
int entry_id,
- const void *stack_addr
+ const void *stack_addr,
+ OffloadFlags offload_flags
)
{
+ OffloadWaitKind wait_kind = c_offload_wait_signal;
+ bool is_traceback = offload_flags.bits.fortran_traceback;
+
+ // define kind of wait if any;
+    // define kind of wait if any;
+    // it can be one of the following kinds:
+ // 2. c_offload_wait_stream for "offload_wait stream(stream)"
+ // 3. c_offload_wait_all_streams for "offload_wait stream(0)"
+ if (num_waits == -1) {
+ wait_kind = (m_stream == 0) ?
+ c_offload_wait_all_streams :
+ c_offload_wait_stream;
+ }
+ char buf[35];
+ const char *stream_str;
+
+ if (m_stream == no_stream || num_waits >= 0) {
+ stream_str = "none";
+ }
+ else if (m_stream == 0) {
+ stream_str = "all";
+ }
+ else {
+ sprintf(buf, "%#llx", m_stream);
+ stream_str = buf;
+ }
+
if (signal == 0) {
OFFLOAD_DEBUG_TRACE_1(1,
GET_OFFLOAD_NUMBER(get_timer_data()),
c_offload_init_func,
"Offload function %s, is_empty=%d, #varDescs=%d, "
- "#waits=%d, signal=none\n",
- name, is_empty, vars_total, num_waits);
- OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
- c_offload_sent_pointer_data,
- "#Wait : %d \n", num_waits);
+ "signal=none, stream=%s, #waits=%d%c",
+ name, is_empty, vars_total, stream_str, num_waits,
+ num_waits == 0 ? '\n' : ' ');
+ // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
+ // since the number of waits is not fixed.
+ if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
+ if (num_waits) {
+ printf("(");
+ if (m_stream == no_stream) {
+ printf("%p", waits[0]);
+ for (int i = 1; i < num_waits; i++) {
+ printf(", %p", waits[i]);
+ }
+ }
+ else if (m_stream != 0) {
+ printf("%#x", m_stream);
+ }
+ else {
+ printf(" all streams");
+ }
+ printf(")");
+ }
+ printf("\n");
+ fflush(NULL);
+ }
+ // stream in wait is reported further in OFFLOAD_REPORT for waits
+ if (m_stream != no_stream && num_waits == 0) {
+ OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
+ c_offload_stream,
+ "%d\n", m_stream);
+ }
OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
c_offload_signal,
"none %d\n", 0);
@@ -2138,27 +2715,62 @@ bool OffloadDescriptor::offload(
GET_OFFLOAD_NUMBER(get_timer_data()),
c_offload_init_func,
"Offload function %s, is_empty=%d, #varDescs=%d, "
- "#waits=%d, signal=%p\n",
- name, is_empty, vars_total, num_waits,
- *signal);
-
+ "signal=%p, stream=%s, #waits=%d%c",
+ name, is_empty, vars_total, *signal, stream_str, num_waits,
+ num_waits == 0 ? '\n' : ' ');
+ // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
+ // since the number of waits is not fixed.
+ if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
+ if (num_waits) {
+ printf("(");
+ if (m_stream == no_stream) {
+ printf("%p", waits[0]);
+ for (int i = 1; i < num_waits; i++) {
+ printf(", %p", waits[i]);
+ }
+ }
+ else if (m_stream != 0) {
+ printf("%#x", m_stream);
+ }
+ else {
+ printf(" all streams");
+ }
+ printf(")");
+ }
+ printf("\n");
+ fflush(NULL);
+ }
+ // stream in wait is reported further in OFFLOAD_REPORT for waits
+ if (m_stream != no_stream && num_waits == 0) {
+ OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
+ c_offload_stream,
+ "%d\n", m_stream);
+ }
OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
c_offload_signal,
"%d\n", signal);
}
+ if (console_enabled >= 1 && offload_flags.flags != 0) {
+ trace_offload_flags(get_timer_data(), offload_flags);
+ }
+
OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
- c_offload_wait,
- "#Wait : %d %p\n", num_waits, waits);
+ c_offload_wait, "%d\n",
+ wait_kind, num_waits,
+ (wait_kind == c_offload_wait_signal) ?
+ waits :
+ reinterpret_cast<const void **>(m_stream));
if (m_status != 0) {
m_status->result = OFFLOAD_SUCCESS;
m_status->device_number = m_device.get_logical_index();
}
- m_need_runfunction = !is_empty;
+ m_initial_need_runfunction = m_need_runfunction = !is_empty;
// wait for dependencies to finish
- if (!wait_dependencies(waits, num_waits)) {
+ if (!wait_dependencies(waits, num_waits, m_stream)) {
cleanup();
return false;
}
@@ -2169,8 +2781,13 @@ bool OffloadDescriptor::offload(
return false;
}
+ if (offload_flags.bits.omp_async) {
+ setup_omp_async_info();
+ }
+
// initiate send for pointers. Want to do it as early as possible.
- if (!send_pointer_data(signal != 0)) {
+ if (!send_pointer_data(signal != 0 || offload_flags.bits.omp_async,
+ signal)) {
cleanup();
return false;
}
@@ -2188,25 +2805,46 @@ bool OffloadDescriptor::offload(
}
// Start the computation
- if (!compute()) {
+ if (!compute(signal)) {
cleanup();
return false;
}
// initiate receive for pointers
- if (!receive_pointer_data(signal != 0)) {
+ if (!receive_pointer_data(signal != 0 || offload_flags.bits.omp_async,
+ true, signal)) {
cleanup();
return false;
}
-
- // if there is a signal save descriptor for the later use.
- if (signal != 0) {
- m_device.add_signal(*signal, this);
+ if (offload_flags.bits.omp_async) {
return true;
}
+    // if there is a signal or stream, save the descriptor for later use.
+ // num_waits == -1 is for offload_wait and there is nothing to save
+ if (num_waits != -1 && (signal != 0 || m_stream != no_stream)) {
+ if (signal != 0) {
+ m_device.add_signal(*signal, this);
+ }
+
+ if (m_stream != no_stream && m_stream != 0) {
+ Stream* stream = Stream::find_stream(m_stream, false);
+ if (stream) {
+ stream->set_last_offload(this);
+ }
+ else {
+ LIBOFFLOAD_ERROR(c_offload_no_stream, m_device.get_logical_index());
+ LIBOFFLOAD_ABORT;
+ }
+ }
+ // if there is a clause with alloc_if(1) and preallocated need to call
+ // offload_finish after runfunction
+ if (!m_preallocated_alloc) {
+ return true;
+ }
+ }
// wait for the offload to finish.
- if (!offload_finish()) {
+ if (!offload_finish(is_traceback)) {
cleanup();
return false;
}
@@ -2215,7 +2853,38 @@ bool OffloadDescriptor::offload(
return true;
}
-bool OffloadDescriptor::offload_finish()
+bool OffloadDescriptor::offload(
+ const char *name,
+ bool is_empty,
+ VarDesc *vars,
+ VarDesc2 *vars2,
+ int vars_total,
+ const void **waits,
+ int num_waits,
+ const void **signal,
+ int entry_id,
+ const void *stack_addr,
+ OffloadFlags offload_flags
+)
+{
+ bool res;
+ res = offload_wrap(name, is_empty, vars, vars2, vars_total,
+ waits, num_waits, signal, entry_id,
+ stack_addr, offload_flags);
+ if (res == false && !m_traceback_called) {
+ if (offload_flags.bits.fortran_traceback) {
+ OFFLOAD_TRACE(3,
+ "Calling Fortran library to continue traceback from MIC\n");
+ FORTRAN_TRACE_BACK(m_status->result);
+ m_traceback_called = true;
+ }
+ }
+ return res;
+}
+
+bool OffloadDescriptor::offload_finish(
+ bool is_traceback
+)
{
COIRESULT res;
@@ -2235,10 +2904,24 @@ bool OffloadDescriptor::offload_finish()
}
if (res != COI_SUCCESS) {
- if (m_status != 0) {
+ if (m_status != 0 && !m_traceback_called) {
m_status->result = translate_coi_error(res);
+ if (is_traceback) {
+ OFFLOAD_TRACE(3,
+ "Calling Fortran library to continue traceback from MIC\n");
+ FORTRAN_TRACE_BACK(m_status->result);
+ m_traceback_called = true;
+ }
return false;
}
+
+ if (is_traceback && !m_traceback_called) {
+ OFFLOAD_TRACE(3,
+ "Calling Fortran library to continue traceback from MIC\n");
+ FORTRAN_TRACE_BACK(OFFLOAD_ERROR);
+ m_traceback_called = true;
+ }
+
report_coi_error(c_event_wait, res);
}
}
@@ -2247,6 +2930,13 @@ bool OffloadDescriptor::offload_finish()
if (!scatter_copyout_data()) {
return false;
}
+
+ if (m_out_with_preallocated &&
+ !receive_pointer_data(m_out_deps_total > 0, false, NULL)) {
+ cleanup();
+ return false;
+ }
+
// wait for receive dependencies to become signaled
if (m_out_deps_total > 0) {
OffloadTimer timer(get_timer_data(), c_offload_host_wait_buffers_reads);
@@ -2320,24 +3010,50 @@ bool OffloadDescriptor::is_signaled()
return signaled;
}
+static Arr_Desc * make_arr_desc(
+ void* ptr_val,
+ int64_t extent_start_val,
+ int64_t extent_elements_val,
+ int64_t size
+)
+{
+ Arr_Desc *res;
+ res = (Arr_Desc *)malloc(sizeof(Arr_Desc));
+ if (res == NULL)
+ LIBOFFLOAD_ERROR(c_malloc);
+ res->base = reinterpret_cast<int64_t>(ptr_val);
+ res->rank = 1;
+ res->dim[0].size = size;
+ res->dim[0].lindex = 0;
+ res->dim[0].lower = extent_start_val;
+ res->dim[0].upper = extent_elements_val + extent_start_val - 1;
+ res->dim[0].stride = 1;
+ return res;
+}
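+
+// Illustrative result of make_arr_desc above (the values are hypothetical):
+// make_arr_desc(p, 10, 100, 8) yields a rank-1 descriptor with
+// base = (int64_t)p and dim[0] = {size = 8, lindex = 0, lower = 10,
+// upper = 109, stride = 1}, i.e. 100 elements of 8 bytes each starting at
+// element index 10.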
+
// Send pointer data if source or destination or both of them are
// noncontiguous. There is guarantee that length of destination enough for
-// transfered data.
+// transferred data.
bool OffloadDescriptor::send_noncontiguous_pointer_data(
int i,
PtrData* src_data,
PtrData* dst_data,
- COIEVENT *event
+ COIEVENT *event,
+ uint64_t &data_sent,
+ uint32_t in_deps_amount,
+ COIEVENT *in_deps
)
{
int64_t offset_src, offset_dst;
int64_t length_src, length_dst;
int64_t length_src_cur, length_dst_cur;
- int64_t send_size, data_sent = 0;
+ int64_t send_size;
COIRESULT res;
bool dst_is_empty = true;
bool src_is_empty = true;
+ data_sent = 0;
+
// Set length_src and length_dst
length_src = (m_vars_extra[i].read_rng_src) ?
m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
@@ -2346,6 +3062,90 @@ bool OffloadDescriptor::send_noncontiguous_pointer_data(
m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
send_size = (length_src < length_dst) ? length_src : length_dst;
+    // If BufferWriteMultiD is defined we can set up the required arguments
+    // and transfer the noncontiguous data via a call to that COI routine.
+ if (__offload_use_coi_noncontiguous_transfer && COI::BufferWriteMultiD) {
+ struct Arr_Desc* arr_desc_dst;
+ struct Arr_Desc* arr_desc_src;
+ int64_t size_src, size_dst;
+ char *base = offload_get_src_base(static_cast<char*>(m_vars[i].ptr),
+ m_vars[i].type.src);
+ COIBUFFER dst_buf = m_vars[i].into ?
+ m_vars_extra[i].dst_data->mic_buf :
+ m_vars_extra[i].src_data->mic_buf;
+
+ offset_src = (m_vars_extra[i].read_rng_src)?
+ m_vars_extra[i].read_rng_src->init_offset : m_vars_extra[i].cpu_disp;
+ size_src = m_vars_extra[i].read_rng_src ?
+ cean_get_transf_size(m_vars_extra[i].read_rng_src) :
+ m_vars[i].size;
+
+ offset_dst = (m_vars_extra[i].read_rng_dst)?
+ m_vars_extra[i].read_rng_dst->init_offset : m_vars[i].disp;
+ size_dst = m_vars_extra[i].read_rng_dst ?
+ cean_get_transf_size(m_vars_extra[i].read_rng_dst) : m_vars[i].size;
+
+ int64_t el_size = (!m_vars[i].into ||
+ (m_vars_extra[i].read_rng_src && m_vars_extra[i].read_rng_dst)) ?
+ 1 :
+ m_vars_extra[i].read_rng_src ?
+ m_vars_extra[i].read_rng_src->arr_desc->dim[
+ m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size :
+ m_vars_extra[i].read_rng_dst->arr_desc->dim[
+ m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size;
+
+ arr_desc_src = (m_vars_extra[i].read_rng_src) ?
+ m_vars_extra[i].read_rng_src->arr_desc :
+        make_arr_desc(NULL,    // not required for source
+ offset_src/el_size, size_src/el_size, el_size);
+
+ arr_desc_dst = !m_vars[i].into ?
+ arr_desc_src :
+ (m_vars_extra[i].read_rng_dst) ?
+ m_vars_extra[i].read_rng_dst->arr_desc :
+ make_arr_desc(NULL,
+ offset_dst/el_size, size_src/el_size, el_size);
+
+ int64_t alloc_disp = m_vars[i].into ?
+ m_vars_extra[i].dst_data->alloc_disp :
+ m_vars_extra[i].src_data->alloc_disp;
+
+ arr_desc_src->base = reinterpret_cast<int64_t>(base);
+ arr_desc_dst->base = 0;
+
+ res = COI::BufferWriteMultiD(
+ dst_buf, // in_DestBuffer,
+ m_device.get_process(), // DestProcess,
+ m_vars[i].offset + m_vars[i].mic_offset -
+ alloc_disp, // Offset
+ (void*)arr_desc_dst, // descriptor of DestArray
+ (void*)arr_desc_src, // descriptor of SrcArray
+ COI_COPY_UNSPECIFIED, // Type
+ in_deps_amount, // Number of in Dependencies
+ in_deps, // array of in Dependencies
+ event); // out Dependency
+ if (res != COI_SUCCESS) {
+ if (m_status != 0) {
+ m_status->result = translate_coi_error(res);
+ return false;
+ }
+ report_coi_error(c_buf_copy, res);
+ }
+ return(true);
+ }
+
+    // if event is defined we must multiply it for all contiguous intervals
+    // that will be copied/written.
+    // Take into account that we already have 1 event.
+ if (event) {
+ m_in_deps_allocated += (length_src / send_size) *
+ ((m_vars_extra[i].read_rng_src) ?
+ m_vars_extra[i].read_rng_src->range_max_number : 1) ;
+ m_in_deps =
+ (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * m_in_deps_allocated);
+ m_in_deps_total--;
+ }
+
// consequently get contiguous ranges,
// define corresponded destination offset and send data
do {
@@ -2402,17 +3202,20 @@ bool OffloadDescriptor::send_noncontiguous_pointer_data(
}
length_dst_cur -= send_size;
dst_is_empty = length_dst_cur == 0;
-
+
+ if (event) {
+ event = &m_in_deps[m_in_deps_total++];
+ }
if (src_data != 0 && src_data->cpu_buf != 0) {
res = COI::BufferCopy(
dst_data->mic_buf,
src_data->cpu_buf,
- m_vars[i].mic_offset - dst_data->alloc_disp +
+ m_vars[i].mic_offset +
m_vars[i].offset + offset_dst,
m_vars_extra[i].cpu_offset + offset_src,
send_size,
COI_COPY_UNSPECIFIED,
- 0, 0,
+ in_deps_amount, in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -2428,12 +3231,12 @@ bool OffloadDescriptor::send_noncontiguous_pointer_data(
res = COI::BufferWrite(
dst_data->mic_buf,
- m_vars[i].mic_offset - dst_data->alloc_disp +
+ m_vars[i].mic_offset +
m_vars[i].offset + offset_dst,
base + offset_src,
send_size,
COI_COPY_UNSPECIFIED,
- 0, 0,
+ in_deps_amount, in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -2443,21 +3246,87 @@ bool OffloadDescriptor::send_noncontiguous_pointer_data(
report_coi_error(c_buf_write, res);
}
}
- data_sent += length_src;
+ data_sent += send_size;
}
while (true);
return true;
}
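
For reference, when no read-range descriptor exists the BufferWriteMultiD branch above builds its rank-1 descriptors with make_arr_desc. Below is a minimal, self-contained sketch of the shape such a descriptor takes for a contiguous span; DimDesc/DemoDesc are stand-ins for the library's Arr_Desc (whose real definition lives in the offload headers), and the concrete numbers are illustrative only.

#include <stdint.h>
#include <stdio.h>

// Stand-ins for the library's Arr_Desc and its dimension entries
// (illustrative only, not the real types).
struct DimDesc  { int64_t size, lindex, lower, upper, stride; };
struct DemoDesc { int64_t base; int64_t rank; DimDesc dim[1]; };

int main()
{
    const int64_t el_size = 8, start = 4, elements = 100;

    DemoDesc desc;
    desc.base          = 0;                     // filled in by the caller later
    desc.rank          = 1;                     // a single dimension
    desc.dim[0].size   = el_size;               // element size in bytes
    desc.dim[0].lindex = 0;                     // language lower index
    desc.dim[0].lower  = start;                 // first element of the span
    desc.dim[0].upper  = start + elements - 1;  // last element of the span
    desc.dim[0].stride = 1;                     // contiguous data

    printf("span covers %lld bytes\n",
           (long long)((desc.dim[0].upper - desc.dim[0].lower + 1) *
                       desc.dim[0].size));
    return 0;
}
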
-bool OffloadDescriptor::send_pointer_data(bool is_async)
+bool OffloadDescriptor::send_pointer_data(bool is_async, void* info)
{
OffloadTimer timer(get_timer_data(), c_offload_host_send_pointers);
+ bool should_use_async_buffer_write = m_initial_need_runfunction;
uint64_t ptr_sent = 0;
COIRESULT res;
+ uint32_t in_deps_amount = 0;
+ COIEVENT *in_deps = NULL;
+
+    // For offload_transfer and offload with an empty body and no signal:
+    // - if there is only one buffer copy - send data synchronously
+    // - if there are multiple buffer copies and
+    //   __offload_parallel_copy is false - send data synchronously
+    // - if there are multiple buffer copies and
+    //   __offload_parallel_copy is true - send data asynchronously
+    // This applies only to large data, greater than __offload_use_async_buffer_write.
+    // Data smaller than __offload_use_async_buffer_write are sent synchronously.
+    // Synchronous transfer results in better performance in COI.
+    // __offload_parallel_copy is false by default but can be changed
+    // via the environment variable OFFLOAD_PARALLEL_COPY.
+ if (!m_initial_need_runfunction && __offload_parallel_copy) {
+ int big_size_count = 0;
+ for (int i = 0; i < m_vars_total; i++) {
+ if (m_vars[i].direction.in &&
+ m_vars[i].size >= __offload_use_async_buffer_write) {
+ switch (m_vars[i].type.dst) {
+ case c_data:
+ case c_void_ptr:
+ case c_cean_var:
+ if (m_vars[i].flags.is_static_dstn) {
+ big_size_count++;
+ }
+ break;
+ case c_string_ptr:
+ case c_data_ptr:
+ case c_cean_var_ptr:
+ case c_dv_ptr:
+ case c_dv_data:
+ case c_dv_ptr_data:
+ case c_dv_data_slice:
+ case c_dv_ptr_data_slice:
+ big_size_count++;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (big_size_count > 1) {
+ should_use_async_buffer_write = true;
+ }
+ }
+
+ if (m_stream != no_stream && m_vars_total != 0) {
+ get_stream_in_dependencies(in_deps_amount, in_deps);
+ }
// Initiate send for pointer data
for (int i = 0; i < m_vars_total; i++) {
+ uint64_t sent_data = m_vars[i].size;
+ uint32_t in_deps_amount_save;
+ COIEVENT *in_deps_save;
+
+ if (m_vars_extra[i].omp_last_event_type == c_last_write) {
+ in_deps_amount_save = in_deps_amount;
+ in_deps_save = in_deps;
+ in_deps_amount = m_in_deps_total;
+ if (in_deps_amount > 0) {
+ in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * in_deps_amount);
+ if (in_deps == NULL)
+ LIBOFFLOAD_ERROR(c_malloc);
+                memcpy(in_deps, m_in_deps, in_deps_amount * sizeof(COIEVENT));
+ }
+ }
switch (m_vars[i].type.dst) {
case c_data_ptr_array:
break;
@@ -2468,7 +3337,8 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
m_vars[i].flags.is_static_dstn) {
COIEVENT *event =
(is_async ||
- m_vars[i].size >= __offload_use_async_buffer_write) ?
+ (should_use_async_buffer_write &&
+ m_vars[i].size >= __offload_use_async_buffer_write)) ?
&m_in_deps[m_in_deps_total++] : 0;
PtrData* dst_data = m_vars[i].into ?
m_vars_extra[i].dst_data :
@@ -2482,7 +3352,8 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
if (m_vars[i].flags.is_noncont_src ||
m_vars[i].flags.is_noncont_dst) {
if (!send_noncontiguous_pointer_data(
- i, src_data, dst_data, event)) {
+ i, src_data, dst_data, event, sent_data,
+ in_deps_amount, in_deps)) {
return false;
}
}
@@ -2490,13 +3361,13 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
res = COI::BufferCopy(
dst_data->mic_buf,
src_data->cpu_buf,
- m_vars[i].mic_offset - dst_data->alloc_disp +
+ m_vars[i].mic_offset +
m_vars[i].offset + m_vars[i].disp,
m_vars_extra[i].cpu_offset +
m_vars_extra[i].cpu_disp,
m_vars[i].size,
COI_COPY_UNSPECIFIED,
- 0, 0,
+ in_deps_amount, in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -2511,12 +3382,12 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
m_vars[i].type.src);
res = COI::BufferWrite(
dst_data->mic_buf,
- m_vars[i].mic_offset - dst_data->alloc_disp +
+ m_vars[i].mic_offset +
m_vars[i].offset + m_vars[i].disp,
base + m_vars_extra[i].cpu_disp,
m_vars[i].size,
COI_COPY_UNSPECIFIED,
- 0, 0,
+ in_deps_amount, in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -2526,7 +3397,7 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
report_coi_error(c_buf_write, res);
}
}
- ptr_sent += m_vars[i].size;
+ ptr_sent += sent_data;
}
break;
@@ -2537,7 +3408,8 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
if (m_vars[i].direction.in && m_vars[i].size > 0) {
COIEVENT *event =
(is_async ||
- m_vars[i].size >= __offload_use_async_buffer_write) ?
+ (should_use_async_buffer_write &&
+ m_vars[i].size >= __offload_use_async_buffer_write)) ?
&m_in_deps[m_in_deps_total++] : 0;
PtrData* dst_data = m_vars[i].into ?
m_vars_extra[i].dst_data :
@@ -2551,19 +3423,20 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
if (m_vars[i].flags.is_noncont_src ||
m_vars[i].flags.is_noncont_dst) {
send_noncontiguous_pointer_data(
- i, src_data, dst_data, event);
+ i, src_data, dst_data, event, sent_data,
+ in_deps_amount, in_deps);
}
else if (src_data != 0 && src_data->cpu_buf != 0) {
res = COI::BufferCopy(
dst_data->mic_buf,
src_data->cpu_buf,
- m_vars[i].mic_offset - dst_data->alloc_disp +
+ m_vars[i].mic_offset +
m_vars[i].offset + m_vars[i].disp,
m_vars_extra[i].cpu_offset +
m_vars_extra[i].cpu_disp,
m_vars[i].size,
COI_COPY_UNSPECIFIED,
- 0, 0,
+ in_deps_amount, in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -2578,12 +3451,12 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
m_vars[i].type.src);
res = COI::BufferWrite(
dst_data->mic_buf,
- m_vars[i].mic_offset - dst_data->alloc_disp +
+ m_vars[i].mic_offset +
m_vars[i].offset + m_vars[i].disp,
base + m_vars_extra[i].cpu_disp,
m_vars[i].size,
COI_COPY_UNSPECIFIED,
- 0, 0,
+ in_deps_amount, in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -2594,7 +3467,7 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
}
}
- ptr_sent += m_vars[i].size;
+ ptr_sent += sent_data;
}
break;
@@ -2609,26 +3482,27 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
COIEVENT *event =
(is_async ||
- m_vars[i].size >= __offload_use_async_buffer_write) ?
+ (should_use_async_buffer_write &&
+ m_vars[i].size >= __offload_use_async_buffer_write)) ?
&m_in_deps[m_in_deps_total++] : 0;
if (m_vars[i].flags.is_noncont_src ||
m_vars[i].flags.is_noncont_dst) {
send_noncontiguous_pointer_data(
- i, src_data, ptr_data, event);
+ i, src_data, ptr_data, event, sent_data,
+ in_deps_amount, in_deps);
}
else if (src_data && src_data->cpu_buf != 0) {
res = COI::BufferCopy(
ptr_data->mic_buf,
src_data->cpu_buf,
- m_vars[i].offset + ptr_data->mic_offset -
- ptr_data->alloc_disp +
+ m_vars[i].offset + ptr_data->mic_offset +
m_vars[i].disp,
m_vars_extra[i].cpu_offset +
m_vars_extra[i].cpu_disp,
m_vars[i].size,
COI_COPY_UNSPECIFIED,
- 0, 0,
+ in_deps_amount, in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -2643,12 +3517,12 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
m_vars[i].type.src);
res = COI::BufferWrite(
ptr_data->mic_buf,
- ptr_data->mic_offset - ptr_data->alloc_disp +
+ ptr_data->mic_offset +
m_vars[i].offset + m_vars[i].disp,
base + m_vars_extra[i].cpu_disp,
m_vars[i].size,
COI_COPY_UNSPECIFIED,
- 0, 0,
+ in_deps_amount, in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -2658,7 +3532,7 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
report_coi_error(c_buf_write, res);
}
}
- ptr_sent += m_vars[i].size;
+ ptr_sent += sent_data;
}
break;
@@ -2678,25 +3552,27 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
m_vars_extra[i].src_data : 0;
COIEVENT *event =
(is_async ||
- m_vars[i].size >= __offload_use_async_buffer_write) ?
+ (should_use_async_buffer_write &&
+ m_vars[i].size >= __offload_use_async_buffer_write)) ?
&m_in_deps[m_in_deps_total++] : 0;
if (m_vars[i].flags.is_noncont_src ||
m_vars[i].flags.is_noncont_dst) {
send_noncontiguous_pointer_data(
- i, src_data, dst_data, event);
+ i, src_data, dst_data, event, sent_data,
+ in_deps_amount, in_deps);
}
else if (src_data && src_data->cpu_buf != 0) {
res = COI::BufferCopy(
dst_data->mic_buf,
src_data->cpu_buf,
- m_vars[i].offset - dst_data->alloc_disp +
+ m_vars[i].offset +
dst_data->mic_offset +
m_vars[i].disp,
m_vars_extra[i].cpu_offset +
m_vars_extra[i].cpu_disp,
m_vars[i].size,
COI_COPY_UNSPECIFIED,
- 0, 0,
+ in_deps_amount, in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -2711,12 +3587,12 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
m_vars[i].type.src);
res = COI::BufferWrite(
dst_data->mic_buf,
- dst_data->mic_offset - dst_data->alloc_disp +
+ dst_data->mic_offset +
m_vars[i].offset + m_vars[i].disp,
base + m_vars_extra[i].cpu_disp,
m_vars[i].size,
COI_COPY_UNSPECIFIED,
- 0, 0,
+ in_deps_amount, in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -2727,14 +3603,18 @@ bool OffloadDescriptor::send_pointer_data(bool is_async)
}
}
- ptr_sent += m_vars[i].size;
+ ptr_sent += sent_data;
}
break;
default:
break;
}
-
+ if (m_vars_extra[i].omp_last_event_type == c_last_write) {
+ in_deps_amount = in_deps_amount_save;
+ in_deps = in_deps_save;
+ register_omp_event_call_back(&m_in_deps[m_in_deps_total - 1], info);
+ }
// alloc field isn't used at target.
// We can reuse it for offset of array pointers.
if (m_vars_extra[i].is_arr_ptr_el) {
@@ -2901,7 +3781,7 @@ bool OffloadDescriptor::gather_copyin_data()
return true;
}
-bool OffloadDescriptor::compute()
+bool OffloadDescriptor::compute(void *info)
{
OffloadTimer timer(get_timer_data(), c_offload_host_start_compute);
@@ -2926,12 +3806,21 @@ bool OffloadDescriptor::compute()
// dispatch task
COIRESULT res;
COIEVENT event;
- res = m_device.compute(m_compute_buffers,
+ uint32_t in_deps_amount = m_in_deps_total;
+ COIEVENT *in_deps = m_in_deps_total > 0 ? m_in_deps : 0;
+
+ if (0 == m_in_deps_total && m_stream != no_stream) {
+ get_stream_in_dependencies(in_deps_amount, in_deps);
+ }
+
+ res = m_device.compute(m_stream,
+ m_compute_buffers,
misc, misc_len,
ret, ret_len,
- m_in_deps_total,
- m_in_deps_total > 0 ? m_in_deps : 0,
+ in_deps_amount,
+ in_deps,
&event);
+
if (res != COI_SUCCESS) {
if (m_status != 0) {
m_status->result = translate_coi_error(res);
@@ -2940,6 +3829,10 @@ bool OffloadDescriptor::compute()
report_coi_error(c_pipeline_run_func, res);
}
+ if (m_omp_async_last_event_type == c_last_runfunc) {
+ register_omp_event_call_back(&event, info);
+ }
+
m_in_deps_total = 1;
m_in_deps[0] = event;
}
@@ -2947,34 +3840,114 @@ bool OffloadDescriptor::compute()
return true;
}
-// recieve pointer data if source or destination or both of them are
+// receive pointer data if source or destination or both of them are
// noncontiguous. There is guarantee that length of destination enough for
-// transfered data.
-bool OffloadDescriptor::recieve_noncontiguous_pointer_data(
+// transferred data.
+bool OffloadDescriptor::receive_noncontiguous_pointer_data(
int i,
- char* base,
COIBUFFER dst_buf,
- COIEVENT *event
+ COIEVENT *event,
+ uint64_t &received_data,
+ uint32_t in_deps_amount,
+ COIEVENT *in_deps
)
{
int64_t offset_src, offset_dst;
int64_t length_src, length_dst;
int64_t length_src_cur, length_dst_cur;
- int64_t recieve_size, data_recieved = 0;
+ int64_t receive_size;
COIRESULT res;
bool dst_is_empty = true;
bool src_is_empty = true;
+ char *base = offload_get_src_base(
+ m_vars[i].into ?
+ static_cast<char*>(m_vars[i].into) :
+ static_cast<char*>(m_vars[i].ptr),
+ m_vars[i].type.dst);
+ received_data = 0;
+
// Set length_src and length_dst
length_src = (m_vars_extra[i].read_rng_src) ?
m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
length_dst = !m_vars[i].into ? length_src :
(m_vars_extra[i].read_rng_dst) ?
m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
- recieve_size = (length_src < length_dst) ? length_src : length_dst;
-
+ receive_size = (length_src < length_dst) ? length_src : length_dst;
+
+    // If BufferReadMultiD is defined, we can set up the required arguments
+    // and transfer the noncontiguous data via a single call to the COI routine.
+ if (__offload_use_coi_noncontiguous_transfer && COI::BufferReadMultiD) {
+ struct Arr_Desc* arr_desc_dst;
+ struct Arr_Desc* arr_desc_src;
+ int64_t size_src, size_dst;
+
+ offset_src = (m_vars_extra[i].read_rng_src)?
+ m_vars_extra[i].read_rng_src->init_offset : m_vars[i].disp;
+ size_src = m_vars_extra[i].read_rng_src ?
+ cean_get_transf_size(m_vars_extra[i].read_rng_src) :
+ m_vars[i].size;
+
+ offset_dst = (m_vars_extra[i].read_rng_dst)?
+ m_vars_extra[i].read_rng_dst->init_offset : m_vars_extra[i].cpu_disp;
+ size_dst = m_vars_extra[i].read_rng_dst ?
+ cean_get_transf_size(m_vars_extra[i].read_rng_dst) : m_vars[i].size;
+
+ int64_t el_size = (!m_vars[i].into ||
+ (m_vars_extra[i].read_rng_src &&
+ m_vars_extra[i].read_rng_dst)) ?
+ 1 :
+ m_vars_extra[i].read_rng_src ?
+ m_vars_extra[i].read_rng_src->arr_desc->dim[
+ m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size :
+ m_vars_extra[i].read_rng_dst->arr_desc->dim[
+ m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size;
+ arr_desc_src = (m_vars_extra[i].read_rng_src) ?
+ m_vars_extra[i].read_rng_src->arr_desc :
+            make_arr_desc(NULL, // not required for source
+ offset_src/el_size, size_src/el_size,
+ el_size);
+ arr_desc_dst = !m_vars[i].into ? arr_desc_src :
+ (m_vars_extra[i].read_rng_dst) ?
+ m_vars_extra[i].read_rng_dst->arr_desc :
+ make_arr_desc(NULL,
+ offset_dst/el_size, size_src/el_size, el_size);
+
+ arr_desc_dst->base = reinterpret_cast<int64_t>(base);
+
+ res = COI::BufferReadMultiD(
+ m_vars_extra[i].src_data->mic_buf, // SourceBuffer
+ m_vars[i].offset + m_vars[i].mic_offset -
+ m_vars_extra[i].src_data->alloc_disp, // Offset
+ (void*)arr_desc_dst, // descriptor of DestArray
+ (void*)arr_desc_src, // descriptor of SrcArray
+ COI_COPY_UNSPECIFIED, // Type
+ in_deps_amount, // Number of in Dependencies
+ in_deps, // array of in Dependencies
+ event); // out Dependency
+ if (res != COI_SUCCESS) {
+ if (m_status != 0) {
+ m_status->result = translate_coi_error(res);
+ return false;
+ }
+ report_coi_error(c_buf_copy, res);
+ }
+ return(true);
+ }
+    // If event is defined, we must replicate it for all contiguous intervals
+    // that will be copied/read.
+    // Take into account that we already have 1 event.
+ if (event) {
+ m_out_deps_allocated += (length_src / receive_size) *
+ ((m_vars_extra[i].read_rng_src) ?
+ m_vars_extra[i].read_rng_src->range_max_number : 1) ;
+ m_out_deps =
+ (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_out_deps_allocated);
+ m_out_deps_total--;
+ }
+
// consequently get contiguous ranges,
- // define corresponded destination offset and recieve data
+    // define the corresponding destination offset and receive data
do {
        // get source offset
if (src_is_empty) {
@@ -2985,8 +3958,8 @@ bool OffloadDescriptor::recieve_noncontiguous_pointer_data(
break;
}
}
- else if (data_recieved == 0) {
- offset_src = 0;
+ else if (received_data == 0) {
+ offset_src = m_vars[i].disp;
}
else {
break;
@@ -2996,9 +3969,9 @@ bool OffloadDescriptor::recieve_noncontiguous_pointer_data(
else {
// if source is contiguous or its contiguous range is greater
// than destination one
- offset_src += recieve_size;
+ offset_src += receive_size;
}
- length_src_cur -= recieve_size;
+ length_src_cur -= receive_size;
src_is_empty = length_src_cur == 0;
// get destination offset
@@ -3027,23 +4000,24 @@ bool OffloadDescriptor::recieve_noncontiguous_pointer_data(
else {
// if destination is contiguous or its contiguous range is greater
// than source one
- offset_dst += recieve_size;
+ offset_dst += receive_size;
}
- length_dst_cur -= recieve_size;
+ length_dst_cur -= receive_size;
dst_is_empty = length_dst_cur == 0;
-
+ if (event) {
+ event = &m_out_deps[m_out_deps_total++];
+ }
if (dst_buf != 0) {
res = COI::BufferCopy(
dst_buf,
m_vars_extra[i].src_data->mic_buf,
m_vars_extra[i].cpu_offset + offset_dst,
m_vars[i].offset + offset_src +
- m_vars[i].mic_offset -
- m_vars_extra[i].src_data->alloc_disp,
- recieve_size,
+ m_vars[i].mic_offset,
+ receive_size,
COI_COPY_UNSPECIFIED,
- m_in_deps_total,
- m_in_deps_total > 0 ? m_in_deps : 0,
+ in_deps_amount,
+ in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -3057,13 +4031,12 @@ bool OffloadDescriptor::recieve_noncontiguous_pointer_data(
res = COI::BufferRead(
m_vars_extra[i].src_data->mic_buf,
m_vars[i].offset + offset_src +
- m_vars[i].mic_offset -
- m_vars_extra[i].src_data->alloc_disp,
+ m_vars[i].mic_offset,
base + offset_dst,
- recieve_size,
+ receive_size,
COI_COPY_UNSPECIFIED,
- m_in_deps_total,
- m_in_deps_total > 0 ? m_in_deps : 0,
+ in_deps_amount,
+ in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -3073,20 +4046,109 @@ bool OffloadDescriptor::recieve_noncontiguous_pointer_data(
report_coi_error(c_buf_read, res);
}
}
- data_recieved += recieve_size;
+ received_data += receive_size;
}
while (true);
return true;
}
-bool OffloadDescriptor::receive_pointer_data(bool is_async)
+bool OffloadDescriptor::receive_pointer_data(bool is_async,
+ bool first_run, void *info)
{
OffloadTimer timer(get_timer_data(), c_offload_host_start_buffers_reads);
+ bool should_use_async_buffer_read = m_initial_need_runfunction;
uint64_t ptr_received = 0;
COIRESULT res;
+    // For offload_transfer and offload with an empty body and no signal:
+    // - if there is only one buffer copy - get data synchronously
+    // - if there are multiple buffer copies and
+    //   __offload_parallel_copy is false - get data synchronously
+    // - if there are multiple buffer copies and
+    //   __offload_parallel_copy is true - get data asynchronously
+    // This applies only to data larger than __offload_use_async_buffer_read.
+    // Data smaller than __offload_use_async_buffer_read are received synchronously.
+    // Synchronous transfer results in better performance in COI.
+    // __offload_parallel_copy is false by default but can be changed
+    // via the environment variable OFFLOAD_PARALLEL_COPY.
+ if (!m_initial_need_runfunction && __offload_parallel_copy) {
+ int big_size_count = 0;
+
+ for (int i = 0; i < m_vars_total; i++) {
+ if (m_vars[i].direction.out &&
+ m_vars[i].size >= __offload_use_async_buffer_read) {
+ // preallocated OUT only at second run
+ if (first_run == m_vars[i].flags.preallocated) {
+ continue;
+ }
+ switch (m_vars[i].type.src) {
+ case c_data:
+ case c_void_ptr:
+ case c_cean_var:
+ if (m_vars[i].flags.is_static) {
+ big_size_count++;
+ }
+ break;
+ case c_string_ptr:
+ case c_data_ptr:
+ case c_cean_var_ptr:
+ case c_dv_data:
+ case c_dv_ptr_data:
+ case c_dv_data_slice:
+ case c_dv_ptr_data_slice:
+ case c_dv_ptr:
+ big_size_count++;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ if (big_size_count > 1) {
+ should_use_async_buffer_read = true;
+ }
+ }
+ uint32_t in_deps_amount = m_in_deps_total;
+ COIEVENT *in_deps = m_in_deps_total > 0 ? m_in_deps : 0;
+
+ if (0 == m_in_deps_total &&
+ m_stream != no_stream &&
+ m_vars_total != 0) {
+ get_stream_in_dependencies(in_deps_amount, in_deps);
+ }
+
for (int i = 0; i < m_vars_total; i++) {
+ uint64_t received_data = m_vars[i].size;
+ uint32_t in_deps_amount_save;
+ COIEVENT *in_deps_save;
+
+ if (m_vars_extra[i].omp_last_event_type == c_last_read) {
+ in_deps_amount_save = in_deps_amount;
+ in_deps_save = in_deps;
+
+ in_deps_amount += m_out_deps_total;
+ if (in_deps_amount > 0) {
+ in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * in_deps_amount);
+ if (in_deps == NULL)
+ LIBOFFLOAD_ERROR(c_malloc);
+ memcpy(in_deps, in_deps_save,
+ in_deps_amount_save * sizeof(COIEVENT));
+                memcpy(in_deps + in_deps_amount_save,
+ m_out_deps,
+ m_out_deps_total * sizeof(COIEVENT));
+ }
+ }
+        // At the first run don't receive by the preallocated target pointer,
+        // as the pointer value becomes available only after the call to
+        // scatter_copyout_data.
+ if (first_run && m_vars[i].alloc_if && m_vars[i].flags.preallocated) {
+ m_preallocated_alloc = true;
+ // need one more call to OffloadDescriptor::receive_pointer_data
+ if (m_vars[i].direction.out) {
+ m_out_with_preallocated = true;
+ }
+ continue;
+ }
switch (m_vars[i].type.src) {
case c_data_ptr_array:
break;
@@ -3098,7 +4160,8 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
COIEVENT *event =
(is_async ||
m_in_deps_total > 0 ||
- m_vars[i].size >= __offload_use_async_buffer_read) ?
+ (should_use_async_buffer_read &&
+ m_vars[i].size >= __offload_use_async_buffer_read)) ?
&m_out_deps[m_out_deps_total++] : 0;
PtrData *ptr_data = NULL;
COIBUFFER dst_buf = NULL; // buffer at host
@@ -3127,8 +4190,9 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
if (m_vars[i].flags.is_noncont_src ||
m_vars[i].flags.is_noncont_dst) {
- recieve_noncontiguous_pointer_data(
- i, base, dst_buf, event);
+ receive_noncontiguous_pointer_data(
+ i, dst_buf, event, received_data,
+ in_deps_amount, in_deps);
}
else if (dst_buf != 0) {
res = COI::BufferCopy(
@@ -3139,8 +4203,8 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
m_vars[i].offset + m_vars[i].disp,
m_vars[i].size,
COI_COPY_UNSPECIFIED,
- m_in_deps_total,
- m_in_deps_total > 0 ? m_in_deps : 0,
+ in_deps_amount,
+ in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -3158,8 +4222,8 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
m_vars_extra[i].cpu_disp,
m_vars[i].size,
COI_COPY_UNSPECIFIED,
- m_in_deps_total,
- m_in_deps_total > 0 ? m_in_deps : 0,
+ in_deps_amount,
+ in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -3169,7 +4233,7 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
report_coi_error(c_buf_read, res);
}
}
- ptr_received += m_vars[i].size;
+ ptr_received += received_data;
}
break;
@@ -3186,7 +4250,8 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
COIEVENT *event =
(is_async ||
m_in_deps_total > 0 ||
- m_vars[i].size >= __offload_use_async_buffer_read) ?
+ (should_use_async_buffer_read &&
+ m_vars[i].size >= __offload_use_async_buffer_read)) ?
&m_out_deps[m_out_deps_total++] : 0;
uint64_t dst_offset = 0;
@@ -3241,8 +4306,10 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
if (m_vars[i].flags.is_noncont_src ||
m_vars[i].flags.is_noncont_dst) {
- recieve_noncontiguous_pointer_data(
- i, base, dst_buf, event);
+ receive_noncontiguous_pointer_data(
+ i, dst_buf, event, received_data,
+ in_deps_amount,
+ in_deps);
}
else if (dst_buf != 0) {
res = COI::BufferCopy(
@@ -3250,12 +4317,11 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
m_vars_extra[i].src_data->mic_buf,
dst_offset,
m_vars[i].offset + m_vars[i].disp +
- m_vars[i].mic_offset -
- m_vars_extra[i].src_data->alloc_disp,
+ m_vars[i].mic_offset,
m_vars[i].size,
COI_COPY_UNSPECIFIED,
- m_in_deps_total,
- m_in_deps_total > 0 ? m_in_deps : 0,
+ in_deps_amount,
+ in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -3269,13 +4335,12 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
res = COI::BufferRead(
m_vars_extra[i].src_data->mic_buf,
m_vars[i].offset + m_vars[i].disp +
- m_vars[i].mic_offset -
- m_vars_extra[i].src_data->alloc_disp,
+ m_vars[i].mic_offset,
base + dst_offset,
m_vars[i].size,
COI_COPY_UNSPECIFIED,
- m_in_deps_total,
- m_in_deps_total > 0 ? m_in_deps : 0,
+ in_deps_amount,
+ in_deps,
event);
if (res != COI_SUCCESS) {
if (m_status != 0) {
@@ -3285,7 +4350,7 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
report_coi_error(c_buf_read, res);
}
}
- ptr_received += m_vars[i].size;
+ ptr_received += received_data;
}
break;
}
@@ -3294,6 +4359,11 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
break;
}
+ if (m_vars_extra[i].omp_last_event_type == c_last_read) {
+ in_deps_amount = in_deps_amount_save;
+ in_deps = in_deps_save;
+ register_omp_event_call_back(&m_out_deps[m_out_deps_total - 1], info);
+ }
// destroy buffers for obsolete stacks
if (m_destroy_stack.size() != 0) {
for (PtrDataList::iterator it = m_destroy_stack.begin();
@@ -3312,8 +4382,13 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
m_vars[i].type.src == c_void_ptr ||
m_vars[i].type.src == c_cean_var)) {
AutoData *auto_data = m_vars_extra[i].auto_data;
- if (auto_data != 0 && auto_data->remove_reference() == 0) {
- m_device.remove_auto_data(auto_data->cpu_addr.start());
+ if (auto_data != 0) {
+ if (m_vars[i].flags.always_delete) {
+ auto_data->nullify_reference();
+ }
+ else if(auto_data->remove_reference() == 0) {
+ m_device.remove_auto_data(auto_data->cpu_addr.start());
+ }
}
}
@@ -3338,7 +4413,12 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
ptr_data->cpu_addr.start());
// remove association from map
- m_device.remove_ptr_data(ptr_data->cpu_addr.start());
+ if (m_vars[i].flags.targetptr) {
+ m_device.remove_targetptr_data(ptr_data->cpu_addr.start());
+ }
+ else {
+ m_device.remove_ptr_data(ptr_data->cpu_addr.start());
+ }
}
}
else if (VAR_TYPE_IS_PTR(m_vars[i].type.dst) ||
@@ -3357,7 +4437,12 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async)
ptr_data->cpu_addr.start());
// remove association from map
- m_device.remove_ptr_data(ptr_data->cpu_addr.start());
+ if (m_vars[i].flags.targetptr) {
+ m_device.remove_targetptr_data(ptr_data->cpu_addr.start());
+ }
+ else {
+ m_device.remove_ptr_data(ptr_data->cpu_addr.start());
+ }
}
}
}
@@ -3416,6 +4501,60 @@ bool OffloadDescriptor::scatter_copyout_data()
m_out.init_buffer(data, m_out_datalen);
for (int i = 0; i < m_vars_total; i++) {
+ bool src_is_for_mic = (m_vars[i].direction.out ||
+ m_vars[i].into == NULL);
+
+ if (m_vars[i].type.src != c_data_ptr_array &&
+ m_vars[i].flags.preallocated && m_vars[i].alloc_if) {
+ PtrData *ptr_data;
+ void *ptr_value;
+ void ** cpu_ptr = src_is_for_mic ?
+ reinterpret_cast<void**>(m_vars[i].ptr) :
+ reinterpret_cast<void**>(m_vars[i].into);
+ void* alloc_base = NULL;
+ int64_t alloc_disp = 0;
+ int64_t alloc_size;
+ if (m_vars_extra[i].alloc != NULL) {
+ // array descriptor
+ const Arr_Desc *ap =
+ static_cast<const Arr_Desc*>(m_vars_extra[i].alloc);
+
+ __arr_data_offset_and_length(ap, alloc_disp, alloc_size);
+
+ alloc_base = reinterpret_cast<void*>(ap->base);
+ }
+
+ // get pointer to target memory
+ m_out.receive_data(&ptr_value, sizeof(void*));
+
+ // add new entry
+ if (!alloc_ptr_data(
+ ptr_data,
+ ptr_value,
+ (alloc_base != NULL) ?
+ alloc_disp : m_vars[i].disp,
+ (alloc_base != NULL) ?
+ alloc_size : m_vars[i].size,
+ alloc_disp,
+ 0,
+ m_vars[i].flags.targetptr,
+ m_vars[i].flags.preallocated,
+ m_vars[i].flags.pin)) {
+ return false;
+ }
+
+ ptr_data->add_reference();
+ *cpu_ptr = ptr_value;
+ if (src_is_for_mic) {
+ m_vars_extra[i].src_data = ptr_data;
+ }
+ else {
+ m_vars_extra[i].dst_data = ptr_data;
+ }
+ m_vars[i].offset = (char*) ptr_value -
+ (char*) ptr_data->cpu_addr.start();
+ }
+
switch (m_vars[i].type.src) {
case c_data_ptr_array:
break;
@@ -3478,8 +4617,8 @@ bool OffloadDescriptor::scatter_copyout_data()
return true;
}
-void get_arr_desc_numbers(
- const arr_desc *ap,
+static void get_arr_desc_numbers(
+ const Arr_Desc *ap,
int64_t el_size,
int64_t &offset,
int64_t &size,
@@ -3500,33 +4639,12 @@ void get_arr_desc_numbers(
}
}
-arr_desc * make_arr_desc(
- void* ptr_val,
- int64_t extent_start_val,
- int64_t extent_elements_val,
- int64_t size
-)
-{
- arr_desc *res;
- res = (arr_desc *)malloc(sizeof(arr_desc));
- if (res == NULL)
- LIBOFFLOAD_ERROR(c_malloc);
- res->base = reinterpret_cast<int64_t>(ptr_val);
- res->rank = 1;
- res->dim[0].size = size;
- res->dim[0].lindex = 0;
- res->dim[0].lower = extent_start_val;
- res->dim[0].upper = extent_elements_val + extent_start_val - 1;
- res->dim[0].stride = 1;
- return res;
-}
-
bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
{
int pointers_number;
int tmp_val;
int new_index = m_vars_total;
- const arr_desc *ap;
+ const Arr_Desc *ap;
const VarDesc3 *vd3 = static_cast<const VarDesc3*>(m_vars[i].ptr);
int flags = vd3->array_fields;
bool src_is_for_mic = (m_vars[i].direction.out ||
@@ -3545,14 +4663,16 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
ReadArrElements<int64_t> alloc_elem;
- ap = static_cast<const arr_desc*>(vd3->ptr_array);
+ ap = static_cast<const Arr_Desc*>(vd3->ptr_array);
- // "pointers_number" for total number of transfered pointers.
+    // "pointers_number" is the total number of transferred pointers.
// For each of them we create new var_desc and put it at the bottom
// of the var_desc's array
get_arr_desc_numbers(ap, sizeof(void *), ptr.offset, ptr.size,
pointers_number, ptr.ranges);
- ptr.base = reinterpret_cast<char*>(ap->base);
+ ptr.base = (m_vars[i].flags.is_pointer) ?
+ *(reinterpret_cast<char**>(ap->base)) :
+ reinterpret_cast<char*>(ap->base);
// 2. prepare memory for new var_descs
m_vars_total += pointers_number;
@@ -3575,7 +4695,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
// 3. Prepare for reading new var_desc's fields
// EXTENT START
if ((flags & (1<<flag_extent_start_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->extent_start);
+ ap = static_cast<const Arr_Desc*>(vd3->extent_start);
get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, ext_start.offset,
ext_start.size, tmp_val, ext_start.ranges);
ext_start.base = reinterpret_cast<char*>(ap->base);
@@ -3595,7 +4715,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
// EXTENT ELEMENTS NUMBER
if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->extent_elements);
+ ap = static_cast<const Arr_Desc*>(vd3->extent_elements);
get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
ext_elements.offset, ext_elements.size,
tmp_val, ext_elements.ranges);
@@ -3616,7 +4736,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
// ALLOC_IF
if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->alloc_if_array);
+ ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array);
get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_if.offset,
alloc_if.size, tmp_val, alloc_if.ranges);
alloc_if.base = reinterpret_cast<char*>(ap->base);
@@ -3628,12 +4748,12 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
}
}
else {
- alloc_if.val = m_vars[i].count;
+ alloc_if.val = m_vars[i].alloc_if;
}
// FREE_IF
if ((flags & (1<<flag_free_if_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->free_if_array);
+ ap = static_cast<const Arr_Desc*>(vd3->free_if_array);
get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, free_if.offset,
free_if.size, tmp_val, free_if.ranges);
free_if.base = reinterpret_cast<char*>(ap->base);
@@ -3645,13 +4765,13 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
}
}
else {
- free_if.val = m_vars[i].count;
+ free_if.val = m_vars[i].free_if;
}
// ALIGN
if ((flags & (1<<flag_align_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->align_array);
+ ap = static_cast<const Arr_Desc*>(vd3->align_array);
get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, align.offset,
align.size, tmp_val, align.ranges);
align.base = reinterpret_cast<char*>(ap->base);
@@ -3669,7 +4789,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
// 3.1 INTO
if (m_vars[i].into) {
- ap = static_cast<const arr_desc*>(m_vars[i].into);
+ ap = static_cast<const Arr_Desc*>(m_vars[i].into);
get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into.offset,
into.size, tmp_val, into.ranges);
into.base = reinterpret_cast<char*>(ap->base);
@@ -3683,7 +4803,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
// 3.2 INTO_START
if ((flags & (1<<flag_into_start_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->into_start);
+ ap = static_cast<const Arr_Desc*>(vd3->into_start);
get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_start.offset,
into_start.size, tmp_val, into_start.ranges);
into_start.base = reinterpret_cast<char*>(ap->base);
@@ -3704,7 +4824,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
// 3.3 INTO_ELEMENTS
if ((flags & (1<<flag_into_elements_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->into_elements);
+ ap = static_cast<const Arr_Desc*>(vd3->into_elements);
get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_elem.offset,
into_elem.size, tmp_val, into_elem.ranges);
into_elem.base = reinterpret_cast<char*>(ap->base);
@@ -3725,7 +4845,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
// alloc_start
if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->alloc_start);
+ ap = static_cast<const Arr_Desc*>(vd3->alloc_start);
get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
alloc_start.offset, alloc_start.size, tmp_val,
alloc_start.ranges);
@@ -3747,7 +4867,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
// alloc_elem
if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
- ap = static_cast<const arr_desc*>(vd3->alloc_elements);
+ ap = static_cast<const Arr_Desc*>(vd3->alloc_elements);
get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_elem.offset,
alloc_elem.size, tmp_val, alloc_elem.ranges);
alloc_elem.base = reinterpret_cast<char*>(ap->base);
@@ -3846,6 +4966,9 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
m_vars[new_index + k].flags.bits = m_vars[i].flags.bits;
m_vars[new_index + k].offset = 0;
m_vars[new_index + k].size = m_vars[i].size;
+ m_vars[new_index + k].flags.targetptr = m_vars[i].flags.targetptr;
+ m_vars[new_index + k].flags.preallocated =
+ m_vars[i].flags.preallocated;
if (ext_start.val == 0) {
m_vars[new_index + k].count = ext_elements.val;
@@ -3901,6 +5024,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
m_vars[new_index + k].type.src = type_src;
m_vars[new_index + k].type.dst = type_dst;
+ m_vars_extra[new_index + k].alloc = m_vars[new_index + k].alloc;
m_vars_extra[new_index + k].is_arr_ptr_el = 1;
m_vars_extra[new_index + k].ptr_arr_offset =
src_is_for_mic ? ptr.offset : into.offset;
@@ -3912,6 +5036,52 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
return true;
}
+// Gets the in-dependencies of the previous offload issued via the stream
+// "m_stream".
+// Out argument in_deps_amount - number of dependencies.
+// Out argument in_deps - array of dependencies.
+// Description of the dependency scheme for streams:
+// ----------------------------------------------------
+// Every offload forms a DAG consisting of 3 nodes:
+// in-transfers, run-function and out-transfers.
+// Every node has in-dependencies and out-dependencies.
+// The out-dependencies of the previous node form the in-dependencies of the
+// current node.
+// Without streams, the in-dependencies of the 1st node (in-transfers) are
+// NULL. With streams, the in-dependencies of the 1st node are the list of
+// out-dependencies of the last node of the previous offload via this stream.
+// Thus the DAGs of two consecutive offloads via the same stream are
+// connected in the way described above.
+void OffloadDescriptor::get_stream_in_dependencies(
+ uint32_t &in_deps_amount,
+ COIEVENT* &in_deps
+)
+{
+ if (m_stream != no_stream && m_stream != 0) {
+ Stream * stream = Stream::find_stream(m_stream, false);
+ if (!stream) {
+ LIBOFFLOAD_ERROR(c_offload_no_stream,
+ m_device.get_logical_index());
+ LIBOFFLOAD_ABORT;
+ }
+ OffloadDescriptor* offload = stream->get_last_offload();
+
+ // if it's the first offload in the stream
+ if (!offload) {
+ return;
+ }
+    // if the last offload has out-transfers
+ if (offload->m_out_deps_total) {
+ in_deps_amount = offload->m_out_deps_total;
+ in_deps = offload->m_out_deps;
+ }
+    // the last offload only sent pointer data or ran the function (or both)
+    // and has no out-transfers
+ else if (offload->m_in_deps_total) {
+ in_deps_amount = offload->m_in_deps_total;
+ in_deps = offload->m_in_deps;
+ }
+ }
+}
+
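
The chaining rule above can be modelled without COI at all. The following sketch is illustrative only (FakeOffload and stream_in_deps are made-up names, not library code): each offload exposes the events it produced, and the next offload queued to the same stream consumes them as its in-dependencies.

#include <cstdio>
#include <vector>

// Made-up stand-in for an offload's dependency lists (illustrative only).
struct FakeOffload {
    std::vector<int> in_deps;    // events this offload waits on
    std::vector<int> out_deps;   // events this offload produces
};

// Analogue of get_stream_in_dependencies(): the next offload waits on the
// previous offload's out-dependencies, or on its in-dependencies when the
// previous offload produced no out-transfers.
static std::vector<int> stream_in_deps(const FakeOffload *last)
{
    if (!last)
        return std::vector<int>();              // first offload in the stream
    return !last->out_deps.empty() ? last->out_deps : last->in_deps;
}

int main()
{
    FakeOffload first;
    first.out_deps.push_back(1);                // out-transfer events 1 and 2
    first.out_deps.push_back(2);

    FakeOffload second;
    second.in_deps = stream_in_deps(&first);    // chained via the stream
    for (size_t i = 0; i < second.in_deps.size(); i++)
        printf("second offload waits on event %d\n", second.in_deps[i]);
    return 0;
}
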
static void __offload_fini_library(void)
{
OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ...\n");
@@ -3945,7 +5115,6 @@ static void __offload_init_library_once(void)
COIRESULT res;
uint32_t num_devices;
std::bitset<MIC_ENGINES_MAX> devices;
-
prefix = report_get_message_str(c_report_host);
// initialize trace
@@ -3989,7 +5158,7 @@ static void __offload_init_library_once(void)
}
// get number of devices installed in the system
- res = COI::EngineGetCount(COI_ISA_KNC, &num_devices);
+ res = COI::EngineGetCount(COI_ISA_MIC, &num_devices);
if (res != COI_SUCCESS) {
return;
}
@@ -4032,7 +5201,7 @@ static void __offload_init_library_once(void)
// use all available devices
for (int i = 0; i < num_devices; i++) {
COIENGINE engine;
- res = COI::EngineGetHandle(COI_ISA_KNC, i, &engine);
+ res = COI::EngineGetHandle(COI_ISA_MIC, i, &engine);
if (res == COI_SUCCESS) {
devices.set(i);
}
@@ -4055,12 +5224,64 @@ static void __offload_init_library_once(void)
}
}
+ // Get DMA channel count to pass it to COI
+ env_var = getenv("OFFLOAD_DMA_CHANNEL_COUNT");
+ if (env_var != 0) {
+ int64_t new_val;
+ if (__offload_parse_int_string(env_var, new_val)) {
+ mic_dma_channel_count = new_val;
+ }
+ else {
+ LIBOFFLOAD_ERROR(c_invalid_env_var_value,
+ "OFFLOAD_DMA_CHANNEL_COUNT");
+ }
+ }
+
+ // Set COI_HOST_THREAD_AFFINITY if OFFLOAD_HOST_THREAD_AFFINITY is set.
+ // Use putenv instead of setenv as Windows has no setenv.
+    // Note: putenv requires that its argument not be freed or modified.
+    // So do not free it after the call to putenv or anywhere else.
+ env_var = getenv("OFFLOAD_HOST_THREAD_AFFINITY");
+ if (env_var != 0) {
+ char * new_env_var =
+ (char*) malloc(sizeof("COI_HOST_THREAD_AFFINITY=") +
+                                 strlen(env_var) + 1);
+ sprintf(new_env_var, "COI_HOST_THREAD_AFFINITY=%s", env_var);
+ putenv(new_env_var);
+ }
+
// library search path for device binaries
env_var = getenv("MIC_LD_LIBRARY_PATH");
if (env_var != 0) {
mic_library_path = strdup(env_var);
}
+
+    // Find the target executable to be used if the main application is not
+    // built as an offload application.
+ const char *base_name = "offload_main";
+ if (mic_library_path != 0) {
+ char *buf = strdup(mic_library_path);
+ char *try_name = (char*) alloca(strlen(mic_library_path) +
+ strlen(base_name) + 2);
+ char *dir, *ptr;
+
+ for (dir = strtok_r(buf, PATH_SEPARATOR, &ptr); dir != 0;
+ dir = strtok_r(0, PATH_SEPARATOR, &ptr)) {
+ // compose a full path
+ sprintf(try_name, "%s/%s", dir, base_name);
+
+ // check if such file exists
+ struct stat st;
+ if (stat(try_name, &st) == 0 && S_ISREG(st.st_mode)) {
+ mic_device_main = strdup(try_name);
+ break;
+ }
+ }
+
+ free(buf);
+ }
+
// memory size reserved for COI buffers
env_var = getenv("MIC_BUFFERSIZE");
if (env_var != 0) {
@@ -4073,6 +5294,30 @@ static void __offload_init_library_once(void)
}
}
+ // memory size reserved for 4K pages for COI buffers
+ env_var = getenv("MIC_4K_BUFFER_RESERVE_SIZE");
+ if (env_var != 0) {
+ uint64_t new_size;
+ if (__offload_parse_size_string(env_var, new_size)) {
+ mic_4k_buffer_size = new_size;
+ }
+ else {
+ LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_4K_BUFFER_RESERVE_SIZE");
+ }
+ }
+
+ // memory size reserved for 2M pages for COI buffers
+ env_var = getenv("MIC_2M_BUFFER_RESERVE_SIZE");
+ if (env_var != 0) {
+ uint64_t new_size;
+ if (__offload_parse_size_string(env_var, new_size)) {
+ mic_2m_buffer_size = new_size;
+ }
+ else {
+ LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_2M_BUFFER_RESERVE_SIZE");
+ }
+ }
+
// determine stacksize for the pipeline on the device
env_var = getenv("MIC_STACKSIZE");
if (env_var != 0 && *env_var != '\0') {
@@ -4170,11 +5415,9 @@ static void __offload_init_library_once(void)
else if (strcmp(env_var, "on_offload_all") == 0) {
__offload_init_type = c_init_on_offload_all;
}
-#ifndef TARGET_WINNT
else if (strcmp(env_var, "on_start") == 0) {
__offload_init_type = c_init_on_start;
}
-#endif // TARGET_WINNT
else {
LIBOFFLOAD_ERROR(c_invalid_env_var_value, offload_init_envname);
}
@@ -4206,6 +5449,32 @@ static void __offload_init_library_once(void)
}
}
+ // parallel copy of offload_transfer
+ env_var = getenv(parallel_copy_envname);
+ if (env_var != 0 && *env_var != '\0') {
+ int64_t new_val;
+ if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
+ __offload_parallel_copy = new_val;
+ }
+ else {
+ LIBOFFLOAD_ERROR(c_invalid_env_var_value,
+ parallel_copy_envname);
+ }
+ }
+
+ // use COI interface for noncontiguous arrays transfer
+ env_var = getenv(use_coi_noncontiguous_transfer_envname);
+ if (env_var != 0 && *env_var != '\0') {
+ uint64_t new_size;
+ if (__offload_parse_size_string(env_var, new_size)) {
+ __offload_use_coi_noncontiguous_transfer = new_size;
+ }
+ else {
+ LIBOFFLOAD_ERROR(c_invalid_env_var_value,
+ use_coi_noncontiguous_transfer_envname);
+ }
+ }
+
// init ORSL
ORSL::init();
}
@@ -4242,7 +5511,20 @@ extern int __offload_init_library(void)
return is_available;
}
-extern "C" void __offload_register_image(const void *target_image)
+extern "C" bool __offload_target_image_is_executable(const void *target_image)
+{
+ const struct Image *image = static_cast<const struct Image*>(target_image);
+
+ // decode image
+ const char *name = image->data;
+ const void *data = image->data + strlen(image->data) + 1;
+
+ // determine image type
+ const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
+ return (hdr->e_type == ET_EXEC);
+}
+
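
The decode step above relies on the fat-binary layout passed in by the registration wrappers: a NUL-terminated name immediately followed by the ELF bytes. A small stand-alone sketch of that decode follows; DemoImage is a stand-in for the library's struct Image (illustrative, not the real definition), and the sketch assumes a Linux host with <elf.h> available.

#include <elf.h>
#include <stdint.h>
#include <string.h>

// Illustrative stand-in for the library's struct Image.
struct DemoImage {
    uint64_t size;      // number of bytes in data[] after the name
    char     data[1];   // "name\0" immediately followed by the ELF bytes
};

static bool demo_image_is_executable(const DemoImage *image)
{
    // Skip the NUL-terminated name to reach the embedded ELF header.
    const void *elf = image->data + strlen(image->data) + 1;
    const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(elf);
    return hdr->e_type == ET_EXEC;   // ET_DYN would mean a shared library
}
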
+extern "C" bool __offload_register_image(const void *target_image)
{
const struct Image *image = static_cast<const struct Image*>(target_image);
@@ -4250,8 +5532,32 @@ extern "C" void __offload_register_image(const void *target_image)
const char *name = image->data;
const void *data = image->data + strlen(image->data) + 1;
uint64_t size = image->size;
- const char *origin = 0;
+ char *origin = (char *) malloc(strlen(image->data) + 1);
uint64_t offset = 0;
+ const char *host_name = image->data;
+ int i;
+
+ if (origin == NULL)
+ LIBOFFLOAD_ERROR(c_malloc);
+
+    // The origin name is the name of the file on the host.
+    // It is used by VTune; since the image is a fat binary, we use the
+    // host file name of the fat binary.
+    // The driver prepends the host file name, terminated by "?", to the
+    // image->data name, so we need to extract that string.
+ i = 0;
+ while (*host_name != '\0' && *host_name != '?') {
+ origin[i] = *host_name;
+ host_name++;
+ i++;
+ }
+ origin[i] = '\0';
+    // This implies the host name does not exist, which really should
+    // not occur. Allow it since the only consumer is VTune.
+ if ((i == 0) || (*host_name != '?')) {
+ free(origin);
+ origin = 0;
+ }
// our actions depend on the image type
const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
@@ -4279,19 +5585,31 @@ extern "C" void __offload_register_image(const void *target_image)
}
}
}
- break;
+ return mic_engines_total > 0;
case ET_DYN:
- // Registration code for libraries is called from the DllMain
- // context (on windows) and thus we cannot do anything usefull
- // here. So we just add it to the list of pending libraries for
- // the later use.
- __target_libs_lock.lock();
- __target_libs = true;
- __target_libs_list.push_back(TargetImage(name, data, size,
- origin, offset));
- __target_libs_lock.unlock();
- break;
+ {
+ char *fullname = origin;
+ // We add the library to a list of pending libraries
+ __target_libs_lock.lock();
+ __target_libs = true;
+ __target_libs_list.push_back(
+ TargetImage(name, data, size, fullname, offset));
+ __target_libs_lock.unlock();
+        // If __target_exe is set, then main has started running.
+        // If main has not started, we can't do anything useful here
+        // because this registration code is called from DllMain
+        // context (on Windows).
+ if (__target_exe != 0) {
+ // There is no need to delay loading the library
+ if (!__offload_init_library()) {
+ // Couldn't validate library as a fat offload library
+ LIBOFFLOAD_ERROR(c_unknown_binary_type);
+ exit(1);
+ }
+ }
+ return true;
+ }
default:
// something is definitely wrong, issue an error and exit
@@ -4330,6 +5648,12 @@ extern "C" void __offload_unregister_image(const void *target_image)
__offload_fini_library();
}
+ else if (hdr->e_type == ET_DYN) {
+ for (int i = 0; i < mic_engines_total; i++) {
+ mic_engines[i].unload_library(data, name);
+ }
+
+ }
}
// Runtime trace interface for user programs
@@ -4362,19 +5686,24 @@ int _Offload_signaled(int index, void *signal)
__offload_init_library();
// check index value
- if (index < 0 || mic_engines_total <= 0) {
+ if (index < 0) {
LIBOFFLOAD_ERROR(c_offload_signaled1, index);
LIBOFFLOAD_ABORT;
}
+ index %= mic_engines_total;
+
// find associated async task
OffloadDescriptor *task =
- mic_engines[index % mic_engines_total].find_signal(signal, false);
+ mic_engines[index].find_signal(signal, false);
if (task == 0) {
LIBOFFLOAD_ERROR(c_offload_signaled2, signal);
LIBOFFLOAD_ABORT;
}
-
+    // if the signal was removed by a completed wait
+ else if (task == SIGNAL_IS_REMOVED) {
+ return (true);
+ }
return task->is_signaled();
}
@@ -4386,6 +5715,153 @@ void _Offload_report(int val)
}
}
+int _Offload_find_associated_mic_memory(
+ int target,
+ const void* cpu_addr,
+ void** cpu_base_addr,
+ uint64_t* buf_length,
+ void** mic_addr,
+ uint64_t* mic_buf_start_offset,
+ int* is_static
+)
+{
+ __offload_init_library();
+
+ // check target value
+ if (target < 0) {
+ LIBOFFLOAD_ERROR(c_offload_signaled1, target);
+ LIBOFFLOAD_ABORT;
+ }
+ target %= mic_engines_total;
+
+ // find existing association in pointer table
+ PtrData* ptr_data = mic_engines[target].find_ptr_data(cpu_addr);
+ if (ptr_data == 0) {
+ OFFLOAD_TRACE(3, "Association does not exist\n");
+ return 0;
+ }
+
+ OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
+ ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
+ ptr_data->is_static);
+
+ if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
+ COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
+ &ptr_data->mic_addr);
+ if (res != COI_SUCCESS) {
+ return 0;
+ }
+ }
+ *cpu_base_addr = const_cast<void *>(ptr_data->cpu_addr.start());
+ *buf_length = ptr_data->cpu_addr.length() - ptr_data->alloc_disp;
+ *mic_addr = (void *)(ptr_data->mic_addr + ptr_data->mic_offset);
+ *mic_buf_start_offset = ptr_data->alloc_disp;
+ *is_static = ptr_data->is_static;
+ return ptr_data->is_static ? 1 : ptr_data->get_reference();
+}
+
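
A minimal usage sketch for the new query routine. The extern declaration simply mirrors the definition above; the example assumes an association already exists for host_ptr (for instance created by an earlier offload with alloc_if(1) free_if(0)), and dump_association is a made-up helper name.

#include <stdint.h>
#include <stdio.h>

extern "C" int _Offload_find_associated_mic_memory(
    int target, const void *cpu_addr, void **cpu_base_addr,
    uint64_t *buf_length, void **mic_addr, uint64_t *mic_buf_start_offset,
    int *is_static);

// Print the device-side buffer associated with a host address, if any.
static void dump_association(int device, const void *host_ptr)
{
    void     *cpu_base   = 0;
    void     *mic_addr   = 0;
    uint64_t  length     = 0;
    uint64_t  mic_offset = 0;
    int       is_static  = 0;

    int refs = _Offload_find_associated_mic_memory(
        device, host_ptr, &cpu_base, &length, &mic_addr, &mic_offset,
        &is_static);
    if (refs == 0) {
        printf("no association for %p on device %d\n", host_ptr, device);
        return;
    }
    printf("base %p, length %llu, mic addr %p, buffer offset %llu, "
           "static %d, refs %d\n",
           cpu_base, (unsigned long long)length, mic_addr,
           (unsigned long long)mic_offset, is_static, refs);
}
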
+_Offload_stream _Offload_stream_create(
+ int device, // MIC device number
+ int number_of_cpus // Cores allocated to the stream
+ )
+{
+ __offload_init_library();
+
+ // check target value
+ if (device < 0) {
+ LIBOFFLOAD_ERROR(c_offload_signaled1, device);
+ LIBOFFLOAD_ABORT;
+ }
+ device %= mic_engines_total;
+
+ // Create new stream and get its handle
+ _Offload_stream handle = Stream::add_stream(device, number_of_cpus);
+ if (handle == 0) {
+ OFFLOAD_TRACE(3, "Can't create stream\n");
+ return 0;
+ }
+
+ // create pipeline associated with the new stream
+ mic_engines[device].get_pipeline(handle);
+
+ return(handle);
+}
+
+int _Offload_stream_destroy(
+ int device, // MIC device number
+ _Offload_stream handle // stream to destroy
+ )
+{
+ __offload_init_library();
+
+ // check target value
+ if (device < 0) {
+ LIBOFFLOAD_ERROR(c_offload_signaled1, device);
+ LIBOFFLOAD_ABORT;
+ }
+ device %= mic_engines_total;
+
+ mic_engines[device].stream_destroy(handle);
+
+ return(true);
+}
+
+int _Offload_stream_completed(int device, _Offload_stream handler)
+{
+ __offload_init_library();
+
+ // check index value
+ if (device < 0) {
+ LIBOFFLOAD_ERROR(c_offload_signaled1, device);
+ LIBOFFLOAD_ABORT;
+ }
+
+ device %= mic_engines_total;
+
+ // get stream
+ Stream * stream;
+
+ if (handler != 0) {
+ stream = Stream::find_stream(handler, false);
+
+ // the stream was not created or was destroyed
+ if (!stream) {
+ LIBOFFLOAD_ERROR(c_offload_no_stream, device);
+ LIBOFFLOAD_ABORT;
+ }
+
+ // find associated async task
+ OffloadDescriptor *task = stream->get_last_offload();
+
+ // offload was completed by offload_wait pragma or wait clause
+ if (task == 0) {
+ return(true);
+ }
+ return task->is_signaled();
+ }
+    // a zero handler means all streams at the device
+ else {
+ StreamMap stream_map = Stream::all_streams;
+ for (StreamMap::iterator it = stream_map.begin();
+ it != stream_map.end(); it++) {
+ Stream * stream = it->second;
+ // find associated async task
+ OffloadDescriptor *task = stream->get_last_offload();
+
+ // offload was completed by offload_wait pragma or wait clause
+ if (task == 0) {
+ return(true);
+ }
+ // if even one stream is not completed result is false
+ if (!task->is_signaled()) {
+ return false;
+ }
+ }
+ // no uncompleted streams
+ return true;
+ }
+}
+
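
A minimal usage sketch for the new stream entry points. The declarations mirror the definitions above; the _Offload_stream typedef used here is an assumption made only to keep the example self-contained (the real type comes from the public offload header), and queuing work to the stream is left as a comment since it depends on compiler support for the stream clause.

#include <stdint.h>

typedef uint64_t _Offload_stream;   // assumed handle type for this sketch

extern "C" {
_Offload_stream _Offload_stream_create(int device, int number_of_cpus);
int _Offload_stream_destroy(int device, _Offload_stream handle);
int _Offload_stream_completed(int device, _Offload_stream handle);
}

int main()
{
    // Create a stream on logical device 0 backed by 4 device cores.
    _Offload_stream s = _Offload_stream_create(0, 4);
    if (s == 0)
        return 1;

    // ... queue offloads to the stream here ...

    // Poll until every offload queued to this stream has completed;
    // a zero handle would poll all streams on the device instead.
    while (!_Offload_stream_completed(0, s))
        ;

    _Offload_stream_destroy(0, s);
    return 0;
}
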
// IDB support
int __dbg_is_attached = 0;
int __dbg_target_id = -1;