Diffstat (limited to 'liboffloadmic/runtime/offload_host.cpp')
-rw-r--r-- | liboffloadmic/runtime/offload_host.cpp | 2246
1 file changed, 1861 insertions, 385 deletions
diff --git a/liboffloadmic/runtime/offload_host.cpp b/liboffloadmic/runtime/offload_host.cpp
index 23a873f..08f626f 100644
--- a/liboffloadmic/runtime/offload_host.cpp
+++ b/liboffloadmic/runtime/offload_host.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (c) 2014 Intel Corporation. All Rights Reserved.
+    Copyright (c) 2014-2015 Intel Corporation. All Rights Reserved.
 
     Redistribution and use in source and binary forms, with or without
     modification, are permitted provided that the following conditions
@@ -28,7 +28,8 @@
 */
 
-// Forward declaration as the following 2 functions are declared as friend in offload_engine.h
+// Forward declaration as the following 2 functions are declared as friend
+// in offload_engine.h.
 // CLANG does not like static to been after friend declaration.
 static void __offload_init_library_once(void);
 static void __offload_fini_library(void);
 
@@ -63,6 +64,55 @@ static void __offload_fini_library(void);
 #define GET_OFFLOAD_NUMBER(timer_data) \
     timer_data? timer_data->offload_number : 0
 
+extern "C" {
+#ifdef TARGET_WINNT
+// Windows does not support imports from libraries without actually
+// including them as dependence. We don't want to include in the
+// dependence since is it used only for Fortran when traceback is enabled.
+// Chose to implement it with GetProcAddress.
+#define FORTRAN_TRACE_BACK win_for__continue_traceback
+int win_for__continue_traceback( _Offload_result coi_offload_result )
+{
+    HINSTANCE hDLL;
+    int (* TraceBackRoutine)(_Offload_result value);
+
+    hDLL = LoadLibrary("libifcoremd.dll");
+    if (hDLL != 0) {
+        TraceBackRoutine = (int (*)(_Offload_result)) GetProcAddress(hDLL,
+            "for__continue_traceback");
+        if (TraceBackRoutine != 0) {
+            return TraceBackRoutine(coi_offload_result);
+        }
+        else {
+            OFFLOAD_TRACE(3,
+                "Cannot find for__continue_traceback routine in libifcorert.dll\n");
+            exit(1);
+        }
+    }
+    else {
+        OFFLOAD_TRACE(3, "Cannot load libifcorert.dll\n");
+        exit(1);
+    }
+    return 0;
+}
+
+#else // TARGET_WINNT
+
+#define FORTRAN_TRACE_BACK for__continue_traceback
+
+// for__continue_traceback is provided as a dummy to resolve link time symbols
+// for C/C++ programs. For Fortran the actual fortran library function in
+// libifcore.so is used.
+#pragma weak for__continue_traceback
+int for__continue_traceback( _Offload_result coi_offload_result )
+{
+    OFFLOAD_TRACE(3,
+        "liboffload function for_continue_traceback should not be called.\n");
+    exit(1);
+}
+#endif //TARGET_WINNT
+} // extern "C"
+
 #ifdef TARGET_WINNT
 // Small subset of ELF declarations for Windows which is needed to compile
 // this file.
ELF header is used to understand what binary type is contained @@ -104,7 +154,16 @@ int offload_number = 0; static const char *htrace_envname = "H_TRACE"; static const char *offload_report_envname = "OFFLOAD_REPORT"; -static char *timer_envname = "H_TIME"; +static const char *timer_envname = "H_TIME"; + +// location of offload_main executable +// To be used if the main application has no offload and is not built +// with -offload but dynamic library linked in has offload pragma +char* mic_device_main = 0; + +// DMA channel count used by COI and set via +// OFFLOAD_DMA_CHANNEL_COUNT environment variable +uint32_t mic_dma_channel_count; // Trace information static const char* vardesc_direction_as_string[] = { @@ -146,6 +205,13 @@ uint32_t mic_stack_size = 12 * 1024 * 1024; // MIC_BUFFERSIZE uint64_t mic_buffer_size = 0; +// Preallocated 4K page memory size for buffers on MIC +uint64_t mic_4k_buffer_size = 0; + +// Preallocated 2M page memory size for buffers on MIC +uint64_t mic_2m_buffer_size = 0; + + // MIC_LD_LIBRARY_PATH char* mic_library_path = 0; @@ -183,6 +249,15 @@ static const char *offload_active_wait_envname = "OFFLOAD_ACTIVE_WAIT"; int __omp_device_num = 0; static const char *omp_device_num_envname = "OMP_DEFAULT_DEVICE"; +//OFFLOAD_PARALLEL_COPY +static bool __offload_parallel_copy = false; +static const char *parallel_copy_envname = "OFFLOAD_PARALLEL_COPY"; + +//Use COI interface for noncontiguous transfer if it exists. +static bool __offload_use_coi_noncontiguous_transfer = false; +static const char *use_coi_noncontiguous_transfer_envname = + "MIC_USE_COI_MULTI_D"; + // The list of pending target libraries static bool __target_libs; static TargetImageList __target_libs_list; @@ -192,6 +267,112 @@ static mutex_t stack_alloc_lock; // Target executable TargetImage* __target_exe; +// Print readable offload flags +static void trace_offload_flags( + OffloadHostTimerData* timer_data, + OffloadFlags offload_flags +) +{ + // Sized big enough for all flag names + char fbuffer[256]; + bool first = true; + if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) { + sprintf(fbuffer, " OffloadFlags=("); + if (offload_flags.bits.fortran_traceback) { + sprintf(fbuffer+strlen(fbuffer), "fortran_traceback"); + first = false; + } + if (offload_flags.bits.omp_async) { + sprintf(fbuffer+strlen(fbuffer), first ? "omp_async" : ",omp_async"); + first = false; + } + OFFLOAD_DEBUG_TRACE_1(1, + GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func, + "%s)\n", fbuffer); + } +} + +// Print readable varDesc flags +static void trace_varDesc_flags( + OffloadHostTimerData* timer_data, + varDescFlags offload_flags +) +{ + // SIzed big enough for all flag names + char fbuffer[256]; + bool first = true; + if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) { + sprintf(fbuffer, " varDescFlags=("); + if (offload_flags.is_static) { + sprintf(fbuffer+strlen(fbuffer), "is_static"); + first = false; + } + if (offload_flags.is_static_dstn) { + sprintf(fbuffer+strlen(fbuffer), + first ? "is_static_dstn" : ",is_static_dstn"); + first = false; + } + if (offload_flags.has_length) { + sprintf(fbuffer+strlen(fbuffer), + first ? "has_length" : ",has_length"); + first = false; + } + if (offload_flags.is_stack_buf) { + sprintf(fbuffer+strlen(fbuffer), + first ? "is_stack_buf" : ",is_stack_buf"); + first = false; + } + if (offload_flags.targetptr) { + sprintf(fbuffer+strlen(fbuffer), + first ? "targetptr" : ",targetptr"); + first = false; + } + if (offload_flags.preallocated) { + sprintf(fbuffer+strlen(fbuffer), + first ? 
"preallocated" : ",preallocated"); + first = false; + } + if (offload_flags.is_pointer) { + sprintf(fbuffer+strlen(fbuffer), + first ? "is_pointer" : ",is_pointer"); + first = false; + } + if (offload_flags.sink_addr) { + sprintf(fbuffer+strlen(fbuffer), + first ? "sink_addr" : ",sink_addr"); + first = false; + } + if (offload_flags.alloc_disp) { + sprintf(fbuffer+strlen(fbuffer), + first ? "alloc_disp" : ",alloc_disp"); + first = false; + } + if (offload_flags.is_noncont_src) { + sprintf(fbuffer+strlen(fbuffer), + first ? "is_noncont_src" : ",is_noncont_src"); + first = false; + } + if (offload_flags.is_noncont_dst) { + sprintf(fbuffer+strlen(fbuffer), + first ? "is_noncont_dst" : ",is_noncont_dst"); + first = false; + } + if (offload_flags.always_copy) { + sprintf(fbuffer+strlen(fbuffer), + first ? "always_copy" : ",always_copy"); + first = false; + } + if (offload_flags.always_delete) { + sprintf(fbuffer+strlen(fbuffer), + first ? "always_delete" : ",always_delete"); + first = false; + } + OFFLOAD_DEBUG_TRACE_1(1, + GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func, + "%s)\n", fbuffer); + } +} + static char * offload_get_src_base(void * ptr, uint8_t type) { char *base; @@ -204,7 +385,7 @@ static char * offload_get_src_base(void * ptr, uint8_t type) else if (VAR_TYPE_IS_DV_DATA_SLICE(type) || VAR_TYPE_IS_DV_DATA(type)) { ArrDesc *dvp; if (VAR_TYPE_IS_DV_DATA_SLICE(type)) { - const arr_desc *ap = static_cast<const arr_desc*>(ptr); + const Arr_Desc *ap = static_cast<const Arr_Desc*>(ptr); dvp = (type == c_dv_data_slice) ? reinterpret_cast<ArrDesc*>(ap->base) : *reinterpret_cast<ArrDesc**>(ap->base); @@ -278,130 +459,228 @@ _Offload_result OffloadDescriptor::translate_coi_error(COIRESULT res) const } } +// is_targetptr == 0 && is_prealloc == 0 - allocation of pointer data; +// is_targetptr == 1 && is_prealloc == 0 - allocation of target memory: +// allocate memory at target; use its value as base in target table. +// is_targetptr == 1 && is_prealloc == 1 - use preallocated target memory: +// base - is address at target of preallocated memory; use its value as +// base in target table. + bool OffloadDescriptor::alloc_ptr_data( PtrData* &ptr_data, void *base, int64_t disp, int64_t size, int64_t alloc_disp, - int align + int align, + bool is_targptr, + bool is_prealloc, + bool pin ) { // total length of base - int64_t length = disp + size; + int64_t length = size; bool is_new; + COIBUFFER targptr_buf; + COIRESULT res; + uint32_t buffer_flags = 0; + char * base_disp = reinterpret_cast<char *>(base) + disp; - OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n", - base, length); + // create buffer with large pages if data length exceeds + // large page threshold + if (length >= __offload_use_2mb_buffers) { + buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE; + } + // Allocate memory at target for targetptr without preallocated as we need + // its address as base argument in call to m_device.insert_ptr_data + if (is_targptr && !is_prealloc) { + length = alloc_disp ? 
length : size + disp; + res = COI::BufferCreate( + length, + COI_BUFFER_NORMAL, + buffer_flags, + 0, + 1, + &m_device.get_process(), + &targptr_buf); + if (res != COI_SUCCESS) { + if (m_status != 0) { + m_status->result = translate_coi_error(res); + } + else if (m_is_mandatory) { + report_coi_error(c_buf_create, res); + } + return false; + } + + res = COI::BufferGetSinkAddress( + targptr_buf, reinterpret_cast<uint64_t *>(&base)); + if (res != COI_SUCCESS) { + if (m_status != 0) { + m_status->result = translate_coi_error(res); + } + else if (m_is_mandatory) { + report_coi_error(c_buf_get_address, res); + } + return false; + } + } + OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n", + alloc_disp ? base : base_disp, + alloc_disp ? length : size + disp); + // add new entry - ptr_data = m_device.insert_ptr_data(base, length, is_new); + + ptr_data = is_targptr ? + m_device.find_targetptr_data(base_disp) : + m_device.find_ptr_data(base_disp); + // if ptr_data is found just need to check it for overlapping + if (ptr_data) { + is_new = false; + base = base_disp; + } + else { + // If association is not found we must create it. + length = alloc_disp ? length : size + disp; + ptr_data = is_targptr ? + m_device.insert_targetptr_data(base, length, is_new) : + m_device.insert_ptr_data(base, length, is_new); + } if (is_new) { OFFLOAD_TRACE(3, "Added new association\n"); if (length > 0) { OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers); - COIRESULT res; // align should be a power of 2 - if (align > 0 && (align & (align - 1)) == 0) { + if (!pin && !is_targptr && + align > 0 && (align & (align - 1)) == 0) { // offset within mic_buffer. Can do offset optimization // only when source address alignment satisfies requested // alignment on the target (cq172736). if ((reinterpret_cast<intptr_t>(base) & (align - 1)) == 0) { - ptr_data->mic_offset = reinterpret_cast<intptr_t>(base) & 4095; + ptr_data->mic_offset = + reinterpret_cast<intptr_t>(base) & 4095; } } // buffer size and flags uint64_t buffer_size = length + ptr_data->mic_offset; - uint32_t buffer_flags = 0; - // create buffer with large pages if data length exceeds - // large page threshold - if (length >= __offload_use_2mb_buffers) { - buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE; - } - - // create CPU buffer - OFFLOAD_DEBUG_TRACE_1(3, + // For targetptr there is no CPU buffer + if (pin || !is_targptr) { + // create CPU buffer + OFFLOAD_DEBUG_TRACE_1(3, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_create_buf_host, "Creating buffer from source memory %p, " "length %lld\n", base, length); - // result is not checked because we can continue without cpu - // buffer. In this case we will use COIBufferRead/Write instead - // of COIBufferCopy. - COI::BufferCreateFromMemory(length, + // result is not checked because we can continue without cpu + // buffer. In this case we will use COIBufferRead/Write + // instead of COIBufferCopy. 
+ + COI::BufferCreateFromMemory(length, COI_BUFFER_NORMAL, 0, base, 1, &m_device.get_process(), &ptr_data->cpu_buf); + } - OFFLOAD_DEBUG_TRACE_1(3, + // create MIC buffer + if (is_prealloc) { + OFFLOAD_DEBUG_TRACE_1(3, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_create_buf_mic, - "Creating buffer for sink: size %lld, offset %d, " - "flags =0x%x\n", buffer_size - alloc_disp, + "Creating buffer from sink memory: size %lld, offset %d, " + "flags =0x%x\n", buffer_size, ptr_data->mic_offset, buffer_flags); - - // create MIC buffer - res = COI::BufferCreate(buffer_size - alloc_disp, - COI_BUFFER_NORMAL, - buffer_flags, - 0, - 1, - &m_device.get_process(), - &ptr_data->mic_buf); - if (res != COI_SUCCESS) { - if (m_status != 0) { - m_status->result = translate_coi_error(res); - } - else if (m_is_mandatory) { - report_coi_error(c_buf_create, res); + res = COI::BufferCreateFromMemory(ptr_data->cpu_addr.length(), + COI_BUFFER_NORMAL, + COI_SINK_MEMORY, + base, + 1, + &m_device.get_process(), + &ptr_data->mic_buf); + if (res != COI_SUCCESS) { + if (m_status != 0) { + m_status->result = translate_coi_error(res); + } + else if (m_is_mandatory) { + report_coi_error(c_buf_create, res); + } + ptr_data->alloc_ptr_data_lock.unlock(); + return false; } - ptr_data->alloc_ptr_data_lock.unlock(); - return false; } - - // make buffer valid on the device. - res = COI::BufferSetState(ptr_data->mic_buf, - m_device.get_process(), - COI_BUFFER_VALID, - COI_BUFFER_NO_MOVE, - 0, 0, 0); - if (res != COI_SUCCESS) { - if (m_status != 0) { - m_status->result = translate_coi_error(res); - } - else if (m_is_mandatory) { - report_coi_error(c_buf_set_state, res); + else if (is_targptr) { + ptr_data->mic_buf = targptr_buf; + } + else if (!pin) { + OFFLOAD_DEBUG_TRACE_1(3, + GET_OFFLOAD_NUMBER(get_timer_data()), + c_offload_create_buf_mic, + "Creating buffer for sink: size %lld, offset %d, " + "flags =0x%x\n", buffer_size, + ptr_data->mic_offset, buffer_flags); + res = COI::BufferCreate(buffer_size, + COI_BUFFER_NORMAL, + buffer_flags, + 0, + 1, + &m_device.get_process(), + &ptr_data->mic_buf); + if (res != COI_SUCCESS) { + if (m_status != 0) { + m_status->result = translate_coi_error(res); + } + else if (m_is_mandatory) { + report_coi_error(c_buf_create, res); + } + ptr_data->alloc_ptr_data_lock.unlock(); + return false; } - ptr_data->alloc_ptr_data_lock.unlock(); - return false; } - res = COI::BufferSetState(ptr_data->mic_buf, - COI_PROCESS_SOURCE, - COI_BUFFER_INVALID, - COI_BUFFER_NO_MOVE, - 0, 0, 0); - if (res != COI_SUCCESS) { - if (m_status != 0) { - m_status->result = translate_coi_error(res); + if (!pin) { + // make buffer valid on the device. 
+ res = COI::BufferSetState(ptr_data->mic_buf, + m_device.get_process(), + COI_BUFFER_VALID, + COI_BUFFER_NO_MOVE, + 0, 0, 0); + if (res != COI_SUCCESS) { + if (m_status != 0) { + m_status->result = translate_coi_error(res); + } + else if (m_is_mandatory) { + report_coi_error(c_buf_set_state, res); + } + ptr_data->alloc_ptr_data_lock.unlock(); + return false; } - else if (m_is_mandatory) { - report_coi_error(c_buf_set_state, res); + + res = COI::BufferSetState(ptr_data->mic_buf, + COI_PROCESS_SOURCE, + COI_BUFFER_INVALID, + COI_BUFFER_NO_MOVE, + 0, 0, 0); + if (res != COI_SUCCESS) { + if (m_status != 0) { + m_status->result = translate_coi_error(res); + } + else if (m_is_mandatory) { + report_coi_error(c_buf_set_state, res); + } + ptr_data->alloc_ptr_data_lock.unlock(); + return false; } - ptr_data->alloc_ptr_data_lock.unlock(); - return false; } } - ptr_data->alloc_disp = alloc_disp; ptr_data->alloc_ptr_data_lock.unlock(); } @@ -415,9 +694,11 @@ bool OffloadDescriptor::alloc_ptr_data( // This is not a new entry. Make sure that provided address range fits // into existing one. - MemRange addr_range(base, length - ptr_data->alloc_disp); + MemRange addr_range(base, length); if (!ptr_data->cpu_addr.contains(addr_range)) { - LIBOFFLOAD_ERROR(c_bad_ptr_mem_range); + LIBOFFLOAD_ERROR(c_bad_ptr_mem_alloc, base, length, + const_cast<void *>(ptr_data->cpu_addr.start()), + ptr_data->cpu_addr.length()); exit(1); } @@ -433,20 +714,24 @@ bool OffloadDescriptor::alloc_ptr_data( bool OffloadDescriptor::find_ptr_data( PtrData* &ptr_data, - void *base, + void *in_base, int64_t disp, int64_t size, + bool is_targetptr, bool report_error ) { // total length of base - int64_t length = disp + size; - + int64_t length = size; + char *base = reinterpret_cast<char *>(in_base) + disp; + OFFLOAD_TRACE(3, "Looking for association for data: addr %p, " "length %lld\n", base, length); // find existing association in pointer table - ptr_data = m_device.find_ptr_data(base); + ptr_data = is_targetptr ? 
+ m_device.find_targetptr_data(base) : + m_device.find_ptr_data(base); if (ptr_data == 0) { if (report_error) { LIBOFFLOAD_ERROR(c_no_ptr_data, base); @@ -464,7 +749,9 @@ bool OffloadDescriptor::find_ptr_data( MemRange addr_range(base, length); if (!ptr_data->cpu_addr.contains(addr_range)) { if (report_error) { - LIBOFFLOAD_ERROR(c_bad_ptr_mem_range); + LIBOFFLOAD_ERROR(c_bad_ptr_mem_range, base, length, + const_cast<void *>(ptr_data->cpu_addr.start()), + ptr_data->cpu_addr.length()); exit(1); } OFFLOAD_TRACE(3, "Existing association partially overlaps with " @@ -591,6 +878,7 @@ bool OffloadDescriptor::offload_stack_memory_manager( PersistDataList::iterator it_begin = m_device.m_persist_list.begin(); PersistDataList::iterator it_end; int erase = 0; + uint64_t cur_thread_id = m_device.get_thread_id(); *is_new = false; @@ -600,9 +888,11 @@ bool OffloadDescriptor::offload_stack_memory_manager( if (stack_begin > it->stack_cpu_addr) { // this stack data must be destroyed - m_destroy_stack.push_front(cur_el.stack_ptr_data); - it_end = it; - erase++; + if (cur_thread_id == cur_el.thread_id) { + m_destroy_stack.push_front(cur_el.stack_ptr_data); + it_end = it; + erase++; + } } else if (stack_begin == it->stack_cpu_addr) { if (routine_id != it-> routine_id) { @@ -627,7 +917,8 @@ bool OffloadDescriptor::offload_stack_memory_manager( return true; } } - else if (stack_begin < it->stack_cpu_addr) { + else if (stack_begin < it->stack_cpu_addr && + cur_thread_id == cur_el.thread_id) { break; } } @@ -638,7 +929,7 @@ bool OffloadDescriptor::offload_stack_memory_manager( m_in_datalen += erase * sizeof(new_el->stack_ptr_data->mic_addr); } // new stack table is created - new_el = new PersistData(stack_begin, routine_id, buf_size); + new_el = new PersistData(stack_begin, routine_id, buf_size, cur_thread_id); // create MIC buffer COIRESULT res; uint32_t buffer_flags = 0; @@ -733,11 +1024,13 @@ bool OffloadDescriptor::setup_descriptors( } // dependencies - m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * (m_vars_total + 1)); + m_in_deps_allocated = m_vars_total + 1; + m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_in_deps_allocated); if (m_in_deps == NULL) LIBOFFLOAD_ERROR(c_malloc); if (m_vars_total > 0) { - m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_vars_total); + m_out_deps_allocated = m_vars_total; + m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_out_deps_allocated); if (m_out_deps == NULL) LIBOFFLOAD_ERROR(c_malloc); } @@ -752,7 +1045,7 @@ bool OffloadDescriptor::setup_descriptors( for (int i = 0; i < m_vars_total; i++) { void* alloc_base = NULL; int64_t alloc_disp = 0; - int64_t alloc_size; + int64_t alloc_size = 0; bool src_is_for_mic = (m_vars[i].direction.out || m_vars[i].into == NULL); @@ -787,25 +1080,41 @@ bool OffloadDescriptor::setup_descriptors( m_vars[i].count, m_vars[i].ptr, m_vars[i].into); + // If any varDesc flags bits set, show them + if (console_enabled >= 1 && m_vars[i].flags.bits != 0) { + trace_varDesc_flags(get_timer_data(), m_vars[i].flags); + } + // preallocated implies targetptr + if (m_vars[i].flags.preallocated) { + // targetptr preallocated alloc_if(1) may not be used with + // an in clause + if (m_vars[i].direction.in && m_vars[i].alloc_if) { + LIBOFFLOAD_ERROR(c_in_with_preallocated); + exit(1); + } + m_vars[i].flags.targetptr = 1; + } if (m_vars[i].alloc != NULL) { // array descriptor - const arr_desc *ap = - static_cast<const arr_desc*>(m_vars[i].alloc); + const Arr_Desc *ap = + static_cast<const Arr_Desc*>(m_vars[i].alloc); // debug dump - 
__arr_desc_dump(" ", "ALLOC", ap, 0); + ARRAY_DESC_DUMP(" ", "ALLOC", ap, 0, 1); __arr_data_offset_and_length(ap, alloc_disp, alloc_size); alloc_base = reinterpret_cast<void*>(ap->base); } + m_vars_extra[i].alloc = m_vars[i].alloc; m_vars_extra[i].cpu_disp = 0; m_vars_extra[i].cpu_offset = 0; m_vars_extra[i].src_data = 0; m_vars_extra[i].read_rng_src = 0; m_vars_extra[i].read_rng_dst = 0; + m_vars_extra[i].omp_last_event_type = c_last_not; // flag is_arr_ptr_el is 1 only for var_descs generated // for c_data_ptr_array type if (i < vars_total) { @@ -815,7 +1124,7 @@ bool OffloadDescriptor::setup_descriptors( switch (m_vars[i].type.src) { case c_data_ptr_array: { - const arr_desc *ap; + const Arr_Desc *ap; const VarDesc3 *vd3 = static_cast<const VarDesc3*>(m_vars[i].ptr); int flags = vd3->array_fields; @@ -824,32 +1133,33 @@ bool OffloadDescriptor::setup_descriptors( OFFLOAD_TRACE(2, " pointer array type is %s\n", vardesc_type_as_string[flags & 0x3f]); - ap = static_cast<const arr_desc*>(vd3->ptr_array); - __arr_desc_dump(" ", "ptr array", ap, 0); + ap = static_cast<const Arr_Desc*>(vd3->ptr_array); + ARRAY_DESC_DUMP(" ", "ptr array", ap, + m_vars[i].flags.is_pointer, 1); if (m_vars[i].into) { - ap = static_cast<const arr_desc*>(m_vars[i].into); - __arr_desc_dump( - " ", "into array", ap, 0); + ap = static_cast<const Arr_Desc*>(m_vars[i].into); + ARRAY_DESC_DUMP( + " ", "into array", ap, 0, 1); } if ((flags & (1<<flag_align_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->align_array); - __arr_desc_dump( - " ", "align array", ap, 0); + ap = static_cast<const Arr_Desc*>(vd3->align_array); + ARRAY_DESC_DUMP( + " ", "align array", ap, 0, 1); } if ((flags & (1<<flag_alloc_if_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->alloc_if_array); - __arr_desc_dump( - " ", "alloc_if array", ap, 0); + ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array); + ARRAY_DESC_DUMP( + " ", "alloc_if array", ap, 0, 1); } if ((flags & (1<<flag_free_if_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->free_if_array); - __arr_desc_dump( - " ", "free_if array", ap, 0); + ap = static_cast<const Arr_Desc*>(vd3->free_if_array); + ARRAY_DESC_DUMP( + " ", "free_if array", ap, 0, 1); } if ((flags & (1<<flag_extent_start_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->extent_start); - __arr_desc_dump( - " ", "extent_start array", ap, 0); + ap = static_cast<const Arr_Desc*>(vd3->extent_start); + ARRAY_DESC_DUMP( + " ", "extent_start array", ap, 0, 1); } else if ((flags & (1<<flag_extent_start_is_scalar)) != 0) { OFFLOAD_TRACE(2, @@ -857,10 +1167,10 @@ bool OffloadDescriptor::setup_descriptors( (int64_t)vd3->extent_start); } if ((flags & (1<<flag_extent_elements_is_array)) != 0) { - ap = static_cast<const arr_desc*> + ap = static_cast<const Arr_Desc*> (vd3->extent_elements); - __arr_desc_dump( - " ", "extent_elements array", ap, 0); + ARRAY_DESC_DUMP(" ", + "extent_elements array", ap, 0, 1); } else if ((flags & (1<<flag_extent_elements_is_scalar)) != 0) { OFFLOAD_TRACE(2, @@ -868,9 +1178,9 @@ bool OffloadDescriptor::setup_descriptors( (int64_t)vd3->extent_elements); } if ((flags & (1<<flag_into_start_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->into_start); - __arr_desc_dump( - " ", "into_start array", ap, 0); + ap = static_cast<const Arr_Desc*>(vd3->into_start); + ARRAY_DESC_DUMP( + " ", "into_start array", ap, 0, 1); } else if ((flags & (1<<flag_into_start_is_scalar)) != 0) { OFFLOAD_TRACE(2, @@ -878,9 +1188,9 @@ bool OffloadDescriptor::setup_descriptors( 
(int64_t)vd3->into_start); } if ((flags & (1<<flag_into_elements_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->into_elements); - __arr_desc_dump( - " ", "into_elements array", ap, 0); + ap = static_cast<const Arr_Desc*>(vd3->into_elements); + ARRAY_DESC_DUMP( + " ", "into_elements array", ap, 0, 1); } else if ((flags & (1<<flag_into_elements_is_scalar)) != 0) { OFFLOAD_TRACE(2, @@ -888,9 +1198,9 @@ bool OffloadDescriptor::setup_descriptors( (int64_t)vd3->into_elements); } if ((flags & (1<<flag_alloc_start_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->alloc_start); - __arr_desc_dump( - " ", "alloc_start array", ap, 0); + ap = static_cast<const Arr_Desc*>(vd3->alloc_start); + ARRAY_DESC_DUMP( + " ", "alloc_start array", ap, 0, 1); } else if ((flags & (1<<flag_alloc_start_is_scalar)) != 0) { OFFLOAD_TRACE(2, @@ -898,9 +1208,9 @@ bool OffloadDescriptor::setup_descriptors( (int64_t)vd3->alloc_start); } if ((flags & (1<<flag_alloc_elements_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->alloc_elements); - __arr_desc_dump( - " ", "alloc_elements array", ap, 0); + ap = static_cast<const Arr_Desc*>(vd3->alloc_elements); + ARRAY_DESC_DUMP(" ", + "alloc_elements array", ap, 0, 1); } else if ((flags & (1<<flag_alloc_elements_is_scalar)) != 0) { OFFLOAD_TRACE(2, @@ -922,11 +1232,11 @@ bool OffloadDescriptor::setup_descriptors( // VarDesc.disp will have an offset from base if (m_vars[i].type.src == c_cean_var) { // array descriptor - const arr_desc *ap = - static_cast<const arr_desc*>(m_vars[i].ptr); + const Arr_Desc *ap = + static_cast<const Arr_Desc*>(m_vars[i].ptr); // debug dump - __arr_desc_dump("", "IN/OUT", ap, 0); + ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic); // offset and length are derived from the array descriptor __arr_data_offset_and_length(ap, m_vars[i].disp, @@ -961,7 +1271,7 @@ bool OffloadDescriptor::setup_descriptors( m_vars[i].ptr, m_vars[i].disp, m_vars[i].size, - false)) { + false, false)) { return false; } @@ -983,10 +1293,11 @@ bool OffloadDescriptor::setup_descriptors( if (m_is_openmp) { if (m_vars[i].flags.is_static) { - // Static data is transferred only by omp target + // Static data is transferred either by omp target // update construct which passes zeros for - // alloc_if and free_if. - if (m_vars[i].alloc_if || m_vars[i].free_if) { + // alloc_if and free_if or by always modifier. 
+ if (!m_vars[i].flags.always_copy && + (m_vars[i].alloc_if || m_vars[i].free_if)) { m_vars[i].direction.bits = c_parameter_nocopy; } } @@ -1004,10 +1315,12 @@ bool OffloadDescriptor::setup_descriptors( m_vars[i].ptr); } - // For automatic variables data is transferred - // only if alloc_if == 0 && free_if == 0 - // or reference count is 1 - if ((m_vars[i].alloc_if || m_vars[i].free_if) && + // For automatic variables data is transferred: + // - if always modifier is used OR + // - if alloc_if == 0 && free_if == 0 OR + // - if reference count is 1 + if (!m_vars[i].flags.always_copy && + (m_vars[i].alloc_if || m_vars[i].free_if) && auto_data != 0 && auto_data->get_reference() != 1) { m_vars[i].direction.bits = c_parameter_nocopy; @@ -1088,8 +1401,12 @@ bool OffloadDescriptor::setup_descriptors( } m_vars[i].size = m_destroy_stack.size(); m_vars_extra[i].src_data = m_stack_ptr_data; - // need to add reference for buffer - m_need_runfunction = true; + + // need to add or remove references for stack buffer at target + if (is_new || m_destroy_stack.size()) { + m_need_runfunction = true; + } + break; } /* fallthru */ @@ -1098,11 +1415,11 @@ bool OffloadDescriptor::setup_descriptors( case c_dv_ptr: if (m_vars[i].type.src == c_cean_var_ptr) { // array descriptor - const arr_desc *ap = - static_cast<const arr_desc*>(m_vars[i].ptr); + const Arr_Desc *ap = + static_cast<const Arr_Desc*>(m_vars[i].ptr); // debug dump - __arr_desc_dump("", "IN/OUT", ap, 1); + ARRAY_DESC_DUMP("", "IN/OUT", ap, 1, !src_is_for_mic); // offset and length are derived from the array descriptor __arr_data_offset_and_length(ap, m_vars[i].disp, @@ -1145,9 +1462,10 @@ bool OffloadDescriptor::setup_descriptors( m_vars[i].free_if) { PtrData *ptr_data; - // check that buffer length >= 0 + // check that buffer length > 0 if (m_vars[i].alloc_if && - m_vars[i].disp + m_vars[i].size < 0) { + m_vars[i].disp + m_vars[i].size < + (m_is_openmp ? 0 : 1)) { LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len); exit(1); } @@ -1166,20 +1484,34 @@ bool OffloadDescriptor::setup_descriptors( m_vars[i].flags.sink_addr = 1; } else if (m_vars[i].alloc_if) { + if (m_vars[i].flags.preallocated) { + m_out_datalen += sizeof(void*); + m_need_runfunction = true; + break; + } // add new entry if (!alloc_ptr_data( ptr_data, - base, + reinterpret_cast<char *>(base) + alloc_disp, (alloc_base != NULL) ? alloc_disp : m_vars[i].disp, (alloc_base != NULL) ? alloc_size : m_vars[i].size, alloc_disp, (alloc_base != NULL) ? - 0 : m_vars[i].align)) { + 0 : m_vars[i].align, + m_vars[i].flags.targetptr, + 0, + m_vars[i].flags.pin)) { return false; } - + if (m_vars[i].flags.targetptr) { + if (!init_mic_address(ptr_data)) { + return false; + } + *static_cast<void**>(m_vars[i].ptr) = base = + reinterpret_cast<void*>(ptr_data->mic_addr); + } if (ptr_data->add_reference() == 0 && ptr_data->mic_buf != 0) { // add buffer to the list of buffers that @@ -1187,12 +1519,14 @@ bool OffloadDescriptor::setup_descriptors( m_compute_buffers.push_back( ptr_data->mic_buf); } - else { + else if (!m_vars[i].flags.pin && + !m_vars[i].flags.preallocated) { // will send buffer address to device m_vars[i].flags.sink_addr = 1; } - if (!ptr_data->is_static) { + if (!m_vars[i].flags.pin && + !ptr_data->is_static) { // need to add reference for buffer m_need_runfunction = true; } @@ -1202,8 +1536,9 @@ bool OffloadDescriptor::setup_descriptors( if (m_is_openmp) { // For omp target update variable is ignored // if it does not exist. 
- if (!m_vars[i].alloc_if && - !m_vars[i].free_if) { + if (m_vars[i].flags.always_copy || + (!m_vars[i].alloc_if && + !m_vars[i].free_if)) { error_if_not_found = false; } } @@ -1213,6 +1548,7 @@ bool OffloadDescriptor::setup_descriptors( base, m_vars[i].disp, m_vars[i].size, + m_vars[i].flags.targetptr, error_if_not_found)) { return false; } @@ -1235,9 +1571,10 @@ bool OffloadDescriptor::setup_descriptors( // data is transferred only if // alloc_if == 0 && free_if == 0 // or reference count is 1 - if ((m_vars[i].alloc_if || - m_vars[i].free_if) && - ptr_data->get_reference() != 1) { + if (!m_vars[i].flags.always_copy && + ((m_vars[i].alloc_if || + m_vars[i].free_if) && + ptr_data->get_reference() != 1)) { m_vars[i].direction.bits = c_parameter_nocopy; } @@ -1257,7 +1594,8 @@ bool OffloadDescriptor::setup_descriptors( m_in_datalen += sizeof(ptr_data->mic_addr); } - if (!ptr_data->is_static && m_vars[i].free_if) { + if (!m_vars[i].flags.pin && + !ptr_data->is_static && m_vars[i].free_if) { // need to decrement buffer reference on target m_need_runfunction = true; } @@ -1277,7 +1615,7 @@ bool OffloadDescriptor::setup_descriptors( base, m_vars[i].disp, m_vars[i].size, - false)) { + false, false)) { return false; } if (ptr_data) { @@ -1308,8 +1646,8 @@ bool OffloadDescriptor::setup_descriptors( case c_dv_ptr_data_slice: ArrDesc *dvp; if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) { - const arr_desc *ap; - ap = static_cast<const arr_desc*>(m_vars[i].ptr); + const Arr_Desc *ap; + ap = static_cast<const Arr_Desc*>(m_vars[i].ptr); dvp = (m_vars[i].type.src == c_dv_data_slice) ? reinterpret_cast<ArrDesc*>(ap->base) : @@ -1331,13 +1669,13 @@ bool OffloadDescriptor::setup_descriptors( if (m_vars[i].direction.bits || m_vars[i].alloc_if || m_vars[i].free_if) { - const arr_desc *ap; + const Arr_Desc *ap; if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) { - ap = static_cast<const arr_desc*>(m_vars[i].ptr); + ap = static_cast<const Arr_Desc*>(m_vars[i].ptr); // debug dump - __arr_desc_dump("", "IN/OUT", ap, 0); + ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic); } if (!__dv_is_contiguous(dvp)) { m_vars[i].flags.is_noncont_src = 1; @@ -1393,14 +1731,17 @@ bool OffloadDescriptor::setup_descriptors( // add new entry if (!alloc_ptr_data( ptr_data, - base, + reinterpret_cast<char *>(base) + alloc_disp, (alloc_base != NULL) ? alloc_disp : m_vars[i].disp, (alloc_base != NULL) ? alloc_size : m_vars[i].size, alloc_disp, (alloc_base != NULL) ? - 0 : m_vars[i].align)) { + 0 : m_vars[i].align, + m_vars[i].flags.targetptr, + m_vars[i].flags.preallocated, + m_vars[i].flags.pin)) { return false; } @@ -1426,8 +1767,9 @@ bool OffloadDescriptor::setup_descriptors( if (m_is_openmp) { // For omp target update variable is ignored // if it does not exist. 
- if (!m_vars[i].alloc_if && - !m_vars[i].free_if) { + if (m_vars[i].flags.always_copy || + (!m_vars[i].alloc_if && + !m_vars[i].free_if)) { error_if_not_found = false; } } @@ -1437,6 +1779,7 @@ bool OffloadDescriptor::setup_descriptors( base, m_vars[i].disp, m_vars[i].size, + m_vars[i].flags.targetptr, error_if_not_found)) { return false; } @@ -1457,10 +1800,12 @@ bool OffloadDescriptor::setup_descriptors( if (ptr_data != 0) { if (m_is_openmp) { - // data is transferred only if - // alloc_if == 0 && free_if == 0 - // or reference count is 1 - if ((m_vars[i].alloc_if || + // data is transferred if + // - if always modifier is used OR + // - if alloc_if == 0 && free_if == 0 OR + // - if reference count is 1 + if (!m_vars[i].flags.always_copy && + (m_vars[i].alloc_if || m_vars[i].free_if) && ptr_data->get_reference() != 1) { m_vars[i].direction.bits = @@ -1503,7 +1848,7 @@ bool OffloadDescriptor::setup_descriptors( base, m_vars[i].disp, m_vars[i].size, - false)) { + false, false)) { return false; } m_vars[i].offset = !ptr_data ? 0 : @@ -1551,11 +1896,11 @@ bool OffloadDescriptor::setup_descriptors( if (m_vars[i].type.dst == c_cean_var) { // array descriptor - const arr_desc *ap = - static_cast<const arr_desc*>(m_vars[i].into); + const Arr_Desc *ap = + static_cast<const Arr_Desc*>(m_vars[i].into); // debug dump - __arr_desc_dump(" ", "INTO", ap, 0); + ARRAY_DESC_DUMP(" ", "INTO", ap, 0, src_is_for_mic); // offset and length are derived from the array descriptor __arr_data_offset_and_length(ap, into_disp, size); @@ -1594,7 +1939,7 @@ bool OffloadDescriptor::setup_descriptors( // find data associated with variable if (!find_ptr_data(ptr_data, m_vars[i].into, - into_disp, size, false)) { + into_disp, size, false, false)) { return false; } if (ptr_data != 0) { @@ -1648,11 +1993,11 @@ bool OffloadDescriptor::setup_descriptors( if (m_vars[i].type.dst == c_cean_var_ptr) { // array descriptor - const arr_desc *ap = - static_cast<const arr_desc*>(m_vars[i].into); + const Arr_Desc *ap = + static_cast<const Arr_Desc*>(m_vars[i].into); // debug dump - __arr_desc_dump(" ", "INTO", ap, 1); + ARRAY_DESC_DUMP(" ", "INTO", ap, 1, src_is_for_mic); // offset and length are derived from the array descriptor __arr_data_offset_and_length(ap, into_disp, size); @@ -1713,20 +2058,34 @@ bool OffloadDescriptor::setup_descriptors( m_vars[i].flags.sink_addr = 1; } else if (m_vars[i].alloc_if) { + if (m_vars[i].flags.preallocated) { + m_out_datalen += sizeof(void*); + m_need_runfunction = true; + break; + } // add new entry if (!alloc_ptr_data( ptr_data, - base, + reinterpret_cast<char *>(base) + alloc_disp, (alloc_base != NULL) ? alloc_disp : into_disp, (alloc_base != NULL) ? alloc_size : size, alloc_disp, (alloc_base != NULL) ? 
- 0 : m_vars[i].align)) { + 0 : m_vars[i].align, + m_vars[i].flags.targetptr, + m_vars[i].flags.preallocated, + m_vars[i].flags.pin)) { return false; } - + if (m_vars[i].flags.targetptr) { + if (!init_mic_address(ptr_data)) { + return false; + } + *static_cast<void**>(m_vars[i].into) = base = + reinterpret_cast<void*>(ptr_data->mic_addr); + } if (ptr_data->add_reference() == 0 && ptr_data->mic_buf != 0) { // add buffer to the list of buffers that @@ -1746,7 +2105,8 @@ bool OffloadDescriptor::setup_descriptors( } else { // use existing association from pointer table - if (!find_ptr_data(ptr_data, base, into_disp, size)) { + if (!find_ptr_data(ptr_data, base, into_disp, + size, m_vars[i].flags.targetptr, true)) { return false; } m_vars[i].flags.sink_addr = 1; @@ -1780,7 +2140,7 @@ bool OffloadDescriptor::setup_descriptors( base, into_disp, m_vars[i].size, - false)) { + false, false)) { return false; } } @@ -1806,17 +2166,17 @@ bool OffloadDescriptor::setup_descriptors( if (m_vars[i].direction.bits || m_vars[i].alloc_if || m_vars[i].free_if) { - const arr_desc *ap; + const Arr_Desc *ap; ArrDesc *dvp; PtrData *ptr_data; int64_t disp; int64_t size; if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) { - ap = static_cast<const arr_desc*>(m_vars[i].into); + ap = static_cast<const Arr_Desc*>(m_vars[i].into); // debug dump - __arr_desc_dump(" ", "INTO", ap, 0); + ARRAY_DESC_DUMP(" ", "INTO", ap, 0, src_is_for_mic); dvp = (m_vars[i].type.dst == c_dv_data_slice) ? reinterpret_cast<ArrDesc*>(ap->base) : @@ -1889,14 +2249,17 @@ bool OffloadDescriptor::setup_descriptors( // add new entry if (!alloc_ptr_data( ptr_data, - base, + reinterpret_cast<char *>(base) + alloc_disp, (alloc_base != NULL) ? alloc_disp : into_disp, (alloc_base != NULL) ? alloc_size : size, alloc_disp, (alloc_base != NULL) ? - 0 : m_vars[i].align)) { + 0 : m_vars[i].align, + m_vars[i].flags.targetptr, + m_vars[i].flags.preallocated, + m_vars[i].flags.pin)) { return false; } if (ptr_data->add_reference() == 0 && @@ -1918,7 +2281,8 @@ bool OffloadDescriptor::setup_descriptors( } else { // use existing association from pointer table - if (!find_ptr_data(ptr_data, base, into_disp, size)) { + if (!find_ptr_data(ptr_data, base, into_disp, + size, m_vars[i].flags.targetptr, true)) { return false; } @@ -1958,7 +2322,7 @@ bool OffloadDescriptor::setup_descriptors( base, into_disp, size, - false)) { + false, false)) { return false; } into_offset = !ptr_data ? @@ -2062,9 +2426,10 @@ bool OffloadDescriptor::setup_misc_data(const char *name) if (m_func_desc == NULL) LIBOFFLOAD_ERROR(c_malloc); m_func_desc->console_enabled = console_enabled; - m_func_desc->timer_enabled = - timer_enabled || (offload_report_level && offload_report_enabled); - m_func_desc->offload_report_level = offload_report_level; + m_func_desc->timer_enabled = offload_report_enabled && + (timer_enabled || offload_report_level); + m_func_desc->offload_report_level = offload_report_enabled ? + offload_report_level : 0; m_func_desc->offload_number = GET_OFFLOAD_NUMBER(get_timer_data()); m_func_desc->in_datalen = m_in_datalen; m_func_desc->out_datalen = m_out_datalen; @@ -2078,35 +2443,193 @@ bool OffloadDescriptor::setup_misc_data(const char *name) return true; } +void OffloadDescriptor::setup_omp_async_info() +{ + OFFLOAD_TRACE(2, "setup_omp_async_info\n"); + OmpAsyncLastEventType event_type = m_need_runfunction ? + c_last_runfunc : c_last_write; + int last_in = m_need_runfunction ? 
0 : -1; + int i; + + for (i = m_vars_total - 1; i >=0; i--) { + switch (m_vars[i].type.dst) { + case c_data: + case c_void_ptr: + case c_cean_var: + if (m_vars[i].direction.out && + m_vars[i].flags.is_static_dstn) { + event_type = c_last_read; + } + else if (last_in < 0 && m_vars[i].direction.in && + m_vars[i].flags.is_static_dstn) { + last_in = i; + } + break; + case c_string_ptr: + case c_data_ptr: + case c_cean_var_ptr: + case c_dv_ptr: + case c_dv_data: + case c_dv_ptr_data: + case c_dv_data_slice: + case c_dv_ptr_data_slice: + + if (m_vars[i].direction.out) { + event_type = c_last_read; + } + else if (last_in < 0 && m_vars[i].direction.in) { + last_in = i; + } + break; + default: + break; + } + if (event_type == c_last_read) { + break; + } + } + + if (event_type == c_last_read) { + m_vars_extra[i].omp_last_event_type = c_last_read; + } + else if (event_type == c_last_write) { + m_vars_extra[last_in].omp_last_event_type = c_last_write; + } + m_omp_async_last_event_type = event_type; + OFFLOAD_TRACE(2, "setup_omp_async_info: event_type=%d\n", + m_omp_async_last_event_type); +} + +extern "C" { + void offload_proxy_task_completed_ooo( + COIEVENT e, + const COIRESULT r, + const void *info + ) + { + /* TODO: Call callback function, pass info. */ + } +} + +void OffloadDescriptor::register_omp_event_call_back( + const COIEVENT *event, + const void *info) +{ + OFFLOAD_TRACE(2, "register_omp_event_call_back(event=%p, info=%p)\n", + event, info); + if (COI::EventRegisterCallback) { + COI::EventRegisterCallback( + *event, + &offload_proxy_task_completed_ooo, + info, 0); + OFFLOAD_TRACE(2, + "COI::EventRegisterCallback found; callback registered\n"); + } +} + bool OffloadDescriptor::wait_dependencies( - const void **waits, - int num_waits + const void **waits, + int num_waits, + _Offload_stream handle ) { OffloadTimer timer(get_timer_data(), c_offload_host_wait_deps); bool ret = true; + OffloadDescriptor *task; + if (num_waits == 0) { + return true; + } - for (int i = 0; i < num_waits; i++) { + // wait for streams + if (num_waits == -1) { + Stream * stream; + // some specific stream of the device + if (handle != 0) { + stream = Stream::find_stream(handle, false); - OffloadDescriptor *task = m_device.find_signal(waits[i], true); - if (task == 0) { - LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(), - waits[i]); - LIBOFFLOAD_ABORT; - } + // the stream was not created or was destroyed + if (!stream) { + LIBOFFLOAD_ERROR(c_offload_no_stream, m_device.get_logical_index()); + LIBOFFLOAD_ABORT; + } + task = stream->get_last_offload(); - if (!task->offload_finish()) { - ret = false; + // offload was completed by previous offload_wait pragma + // or wait clause + if (task == 0) { + return true; + } + if (!task->offload_finish(0)) { //arg is 0 for is_traceback + ret = false; + } + task->cleanup(); + stream->set_last_offload(NULL); + delete task; } + // all streams of the device or over all devices + else { + StreamMap stream_map = Stream::all_streams; + for (StreamMap::iterator it = stream_map.begin(); + it != stream_map.end(); it++) { + Stream * stream = it->second; - task->cleanup(); - delete task; - } + if (!m_wait_all_devices && + stream->get_device() != m_device.get_logical_index()) { + continue; + } + // get associated async task + OffloadDescriptor *task = stream->get_last_offload(); + // offload was completed by offload_wait pragma or wait clause + if (task == 0) { + continue; + } + if (!task->offload_finish(0)) { //arg is 0 for is_traceback + ret = false; + } + task->cleanup(); + 
stream->set_last_offload(NULL); + delete task; + } + // no uncompleted streams + return true; + } + } + else { + // if handle is equal to no_stream it's wait for signals + for (int i = 0; i < num_waits; i++) { + _Offload_stream stream_handle; + Stream *stream; + task = m_device.find_signal(waits[i], true); + if (task == 0) { + LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(), + waits[i]); + LIBOFFLOAD_ABORT; + } + else if (task == SIGNAL_IS_REMOVED) { + continue; + } + if (!task->offload_finish(0)) { //arg is 0 for is_traceback + ret = false; + } + task->cleanup(); + // if the offload both has signal and is last offload of its + // stream, we must wipe out the "last_offload" reference as + // the offload already is finished. + stream_handle = task->m_stream; + if (stream_handle != -1) { + stream = Stream::find_stream(stream_handle, false); + if (stream && stream->get_last_offload() == task) { + stream->set_last_offload(NULL); + } + } + delete task; + } + } return ret; } -bool OffloadDescriptor::offload( +bool OffloadDescriptor::offload_wrap( const char *name, bool is_empty, VarDesc *vars, @@ -2116,19 +2639,73 @@ bool OffloadDescriptor::offload( int num_waits, const void **signal, int entry_id, - const void *stack_addr + const void *stack_addr, + OffloadFlags offload_flags ) { + OffloadWaitKind wait_kind = c_offload_wait_signal; + bool is_traceback = offload_flags.bits.fortran_traceback; + + // define kind of wait if any; + // there can be one off the following kind: + // 1. c_offload_wait_signal for "offload_wait wait(signal)" + // 2. c_offload_wait_stream for "offload_wait stream(stream)" + // 3. c_offload_wait_all_streams for "offload_wait stream(0)" + if (num_waits == -1) { + wait_kind = (m_stream == 0) ? + c_offload_wait_all_streams : + c_offload_wait_stream; + } + char buf[35]; + const char *stream_str; + + if (m_stream == no_stream || num_waits >= 0) { + stream_str = "none"; + } + else if (m_stream == 0) { + stream_str = "all"; + } + else { + sprintf(buf, "%#llx", m_stream); + stream_str = buf; + } + if (signal == 0) { OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_init_func, "Offload function %s, is_empty=%d, #varDescs=%d, " - "#waits=%d, signal=none\n", - name, is_empty, vars_total, num_waits); - OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()), - c_offload_sent_pointer_data, - "#Wait : %d \n", num_waits); + "signal=none, stream=%s, #waits=%d%c", + name, is_empty, vars_total, stream_str, num_waits, + num_waits == 0 ? '\n' : ' '); + // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits + // since the number of waits is not fixed. 
+ if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) { + if (num_waits) { + printf("("); + if (m_stream == no_stream) { + printf("%p", waits[0]); + for (int i = 1; i < num_waits; i++) { + printf(", %p", waits[i]); + } + } + else if (m_stream != 0) { + printf("%#x", m_stream); + } + else { + printf(" all streams"); + } + printf(")"); + } + printf("\n"); + fflush(NULL); + } + // stream in wait is reported further in OFFLOAD_REPORT for waits + if (m_stream != no_stream && num_waits == 0) { + OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()), + c_offload_stream, + "%d\n", m_stream); + } OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_signal, "none %d\n", 0); @@ -2138,27 +2715,62 @@ bool OffloadDescriptor::offload( GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_init_func, "Offload function %s, is_empty=%d, #varDescs=%d, " - "#waits=%d, signal=%p\n", - name, is_empty, vars_total, num_waits, - *signal); - + "signal=%p, stream=%s, #waits=%d%c", + name, is_empty, vars_total, *signal, stream_str, num_waits, + num_waits == 0 ? '\n' : ' '); + // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits + // since the number of waits is not fixed. + if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) { + if (num_waits) { + printf("("); + if (m_stream == no_stream) { + printf("%p", waits[0]); + for (int i = 1; i < num_waits; i++) { + printf(", %p", waits[i]); + } + printf(")"); + } + else if (m_stream != 0) { + printf("%#x", m_stream); + } + else { + printf(" all streams"); + } + printf(")"); + } + printf("\n"); + fflush(NULL); + } + // stream in wait is reported further in OFFLOAD_REPORT for waits + if (m_stream != no_stream && num_waits == 0) { + OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()), + c_offload_stream, + "%d\n", m_stream); + } OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_signal, "%d\n", signal); } + if (console_enabled >= 1 && offload_flags.flags != 0) { + trace_offload_flags(get_timer_data(), offload_flags); + } + OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()), - c_offload_wait, - "#Wait : %d %p\n", num_waits, waits); + c_offload_wait, "%d\n", + wait_kind, num_waits, + (wait_kind == c_offload_wait_signal) ? + waits : + reinterpret_cast<const void **>(m_stream)); if (m_status != 0) { m_status->result = OFFLOAD_SUCCESS; m_status->device_number = m_device.get_logical_index(); } - m_need_runfunction = !is_empty; + m_initial_need_runfunction = m_need_runfunction = !is_empty; // wait for dependencies to finish - if (!wait_dependencies(waits, num_waits)) { + if (!wait_dependencies(waits, num_waits, m_stream)) { cleanup(); return false; } @@ -2169,8 +2781,13 @@ bool OffloadDescriptor::offload( return false; } + if (offload_flags.bits.omp_async) { + setup_omp_async_info(); + } + // initiate send for pointers. Want to do it as early as possible. - if (!send_pointer_data(signal != 0)) { + if (!send_pointer_data(signal != 0 || offload_flags.bits.omp_async, + signal)) { cleanup(); return false; } @@ -2188,25 +2805,46 @@ bool OffloadDescriptor::offload( } // Start the computation - if (!compute()) { + if (!compute(signal)) { cleanup(); return false; } // initiate receive for pointers - if (!receive_pointer_data(signal != 0)) { + if (!receive_pointer_data(signal != 0 || offload_flags.bits.omp_async, + true, signal)) { cleanup(); return false; } - - // if there is a signal save descriptor for the later use. 
- if (signal != 0) { - m_device.add_signal(*signal, this); + if (offload_flags.bits.omp_async) { return true; } + // if there is a signal or stream save descriptor for the later use. + // num_waits == -1 is for offload_wait and there is nothing to save + if (num_waits != -1 && (signal != 0 || m_stream != no_stream)) { + if (signal != 0) { + m_device.add_signal(*signal, this); + } + + if (m_stream != no_stream && m_stream != 0) { + Stream* stream = Stream::find_stream(m_stream, false); + if (stream) { + stream->set_last_offload(this); + } + else { + LIBOFFLOAD_ERROR(c_offload_no_stream, m_device.get_logical_index()); + LIBOFFLOAD_ABORT; + } + } + // if there is a clause with alloc_if(1) and preallocated need to call + // offload_finish after runfunction + if (!m_preallocated_alloc) { + return true; + } + } // wait for the offload to finish. - if (!offload_finish()) { + if (!offload_finish(is_traceback)) { cleanup(); return false; } @@ -2215,7 +2853,38 @@ bool OffloadDescriptor::offload( return true; } -bool OffloadDescriptor::offload_finish() +bool OffloadDescriptor::offload( + const char *name, + bool is_empty, + VarDesc *vars, + VarDesc2 *vars2, + int vars_total, + const void **waits, + int num_waits, + const void **signal, + int entry_id, + const void *stack_addr, + OffloadFlags offload_flags +) +{ + bool res; + res = offload_wrap(name, is_empty, vars, vars2, vars_total, + waits, num_waits, signal, entry_id, + stack_addr, offload_flags); + if (res == false && !m_traceback_called) { + if (offload_flags.bits.fortran_traceback) { + OFFLOAD_TRACE(3, + "Calling Fortran library to continue traceback from MIC\n"); + FORTRAN_TRACE_BACK(m_status->result); + m_traceback_called = true; + } + } + return res; +} + +bool OffloadDescriptor::offload_finish( + bool is_traceback +) { COIRESULT res; @@ -2235,10 +2904,24 @@ bool OffloadDescriptor::offload_finish() } if (res != COI_SUCCESS) { - if (m_status != 0) { + if (m_status != 0 && !m_traceback_called) { m_status->result = translate_coi_error(res); + if (is_traceback) { + OFFLOAD_TRACE(3, + "Calling Fortran library to continue traceback from MIC\n"); + FORTRAN_TRACE_BACK(m_status->result); + m_traceback_called = true; + } return false; } + + if (is_traceback && !m_traceback_called) { + OFFLOAD_TRACE(3, + "Calling Fortran library to continue traceback from MIC\n"); + FORTRAN_TRACE_BACK(OFFLOAD_ERROR); + m_traceback_called = true; + } + report_coi_error(c_event_wait, res); } } @@ -2247,6 +2930,13 @@ bool OffloadDescriptor::offload_finish() if (!scatter_copyout_data()) { return false; } + + if (m_out_with_preallocated && + !receive_pointer_data(m_out_deps_total > 0, false, NULL)) { + cleanup(); + return false; + } + // wait for receive dependencies to become signaled if (m_out_deps_total > 0) { OffloadTimer timer(get_timer_data(), c_offload_host_wait_buffers_reads); @@ -2320,24 +3010,50 @@ bool OffloadDescriptor::is_signaled() return signaled; } +static Arr_Desc * make_arr_desc( + void* ptr_val, + int64_t extent_start_val, + int64_t extent_elements_val, + int64_t size +) +{ + Arr_Desc *res; + res = (Arr_Desc *)malloc(sizeof(Arr_Desc)); + if (res == NULL) + LIBOFFLOAD_ERROR(c_malloc); + res->base = reinterpret_cast<int64_t>(ptr_val); + res->rank = 1; + res->dim[0].size = size; + res->dim[0].lindex = 0; + res->dim[0].lower = extent_start_val; + res->dim[0].upper = extent_elements_val + extent_start_val - 1; + res->dim[0].stride = 1; + return res; +} + // Send pointer data if source or destination or both of them are // noncontiguous. 
There is guarantee that length of destination enough for -// transfered data. +// transferred data. bool OffloadDescriptor::send_noncontiguous_pointer_data( int i, PtrData* src_data, PtrData* dst_data, - COIEVENT *event + COIEVENT *event, + uint64_t &data_sent, + uint32_t in_deps_amount, + COIEVENT *in_deps ) { int64_t offset_src, offset_dst; int64_t length_src, length_dst; int64_t length_src_cur, length_dst_cur; - int64_t send_size, data_sent = 0; + int64_t send_size; COIRESULT res; bool dst_is_empty = true; bool src_is_empty = true; + data_sent = 0; + // Set length_src and length_dst length_src = (m_vars_extra[i].read_rng_src) ? m_vars_extra[i].read_rng_src->range_size : m_vars[i].size; @@ -2346,6 +3062,90 @@ bool OffloadDescriptor::send_noncontiguous_pointer_data( m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size; send_size = (length_src < length_dst) ? length_src : length_dst; + // If BufferWriteMultiD is defined we can set values of required arguments + // and transfer noncontiguous data via call to the COI routine. + if (__offload_use_coi_noncontiguous_transfer && COI::BufferWriteMultiD) { + struct Arr_Desc* arr_desc_dst; + struct Arr_Desc* arr_desc_src; + int64_t size_src, size_dst; + char *base = offload_get_src_base(static_cast<char*>(m_vars[i].ptr), + m_vars[i].type.src); + COIBUFFER dst_buf = m_vars[i].into ? + m_vars_extra[i].dst_data->mic_buf : + m_vars_extra[i].src_data->mic_buf; + + offset_src = (m_vars_extra[i].read_rng_src)? + m_vars_extra[i].read_rng_src->init_offset : m_vars_extra[i].cpu_disp; + size_src = m_vars_extra[i].read_rng_src ? + cean_get_transf_size(m_vars_extra[i].read_rng_src) : + m_vars[i].size; + + offset_dst = (m_vars_extra[i].read_rng_dst)? + m_vars_extra[i].read_rng_dst->init_offset : m_vars[i].disp; + size_dst = m_vars_extra[i].read_rng_dst ? + cean_get_transf_size(m_vars_extra[i].read_rng_dst) : m_vars[i].size; + + int64_t el_size = (!m_vars[i].into || + (m_vars_extra[i].read_rng_src && m_vars_extra[i].read_rng_dst)) ? + 1 : + m_vars_extra[i].read_rng_src ? + m_vars_extra[i].read_rng_src->arr_desc->dim[ + m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size : + m_vars_extra[i].read_rng_dst->arr_desc->dim[ + m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size; + + arr_desc_src = (m_vars_extra[i].read_rng_src) ? + m_vars_extra[i].read_rng_src->arr_desc : + make_arr_desc(NULL, // don't required for source + offset_src/el_size, size_src/el_size, el_size); + + arr_desc_dst = !m_vars[i].into ? + arr_desc_src : + (m_vars_extra[i].read_rng_dst) ? + m_vars_extra[i].read_rng_dst->arr_desc : + make_arr_desc(NULL, + offset_dst/el_size, size_src/el_size, el_size); + + int64_t alloc_disp = m_vars[i].into ? 
+ m_vars_extra[i].dst_data->alloc_disp : + m_vars_extra[i].src_data->alloc_disp; + + arr_desc_src->base = reinterpret_cast<int64_t>(base); + arr_desc_dst->base = 0; + + res = COI::BufferWriteMultiD( + dst_buf, // in_DestBuffer, + m_device.get_process(), // DestProcess, + m_vars[i].offset + m_vars[i].mic_offset - + alloc_disp, // Offset + (void*)arr_desc_dst, // descriptor of DestArray + (void*)arr_desc_src, // descriptor of SrcArray + COI_COPY_UNSPECIFIED, // Type + in_deps_amount, // Number of in Dependencies + in_deps, // array of in Dependencies + event); // out Dependency + if (res != COI_SUCCESS) { + if (m_status != 0) { + m_status->result = translate_coi_error(res); + return false; + } + report_coi_error(c_buf_copy, res); + } + return(true); + } + + // if event is defined we must multiplate it for all contiguous intervals + // that will be Copied/Write. + // Take in account that we already have 1 event. + if (event) { + m_in_deps_allocated += (length_src / send_size) * + ((m_vars_extra[i].read_rng_src) ? + m_vars_extra[i].read_rng_src->range_max_number : 1) ; + m_in_deps = + (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * m_in_deps_allocated); + m_in_deps_total--; + } + // consequently get contiguous ranges, // define corresponded destination offset and send data do { @@ -2402,17 +3202,20 @@ bool OffloadDescriptor::send_noncontiguous_pointer_data( } length_dst_cur -= send_size; dst_is_empty = length_dst_cur == 0; - + + if (event) { + event = &m_in_deps[m_in_deps_total++]; + } if (src_data != 0 && src_data->cpu_buf != 0) { res = COI::BufferCopy( dst_data->mic_buf, src_data->cpu_buf, - m_vars[i].mic_offset - dst_data->alloc_disp + + m_vars[i].mic_offset + m_vars[i].offset + offset_dst, m_vars_extra[i].cpu_offset + offset_src, send_size, COI_COPY_UNSPECIFIED, - 0, 0, + in_deps_amount, in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -2428,12 +3231,12 @@ bool OffloadDescriptor::send_noncontiguous_pointer_data( res = COI::BufferWrite( dst_data->mic_buf, - m_vars[i].mic_offset - dst_data->alloc_disp + + m_vars[i].mic_offset + m_vars[i].offset + offset_dst, base + offset_src, send_size, COI_COPY_UNSPECIFIED, - 0, 0, + in_deps_amount, in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -2443,21 +3246,87 @@ bool OffloadDescriptor::send_noncontiguous_pointer_data( report_coi_error(c_buf_write, res); } } - data_sent += length_src; + data_sent += send_size; } while (true); return true; } -bool OffloadDescriptor::send_pointer_data(bool is_async) +bool OffloadDescriptor::send_pointer_data(bool is_async, void* info) { OffloadTimer timer(get_timer_data(), c_offload_host_send_pointers); + bool should_use_async_buffer_write = m_initial_need_runfunction; uint64_t ptr_sent = 0; COIRESULT res; + uint32_t in_deps_amount = 0; + COIEVENT *in_deps = NULL; + + // For offload_transfer and offload with empty body without signal: + // - if there is only one buffer copy - send data synchronously + // - if there are multiple buffer copy and + // __offload_parallel_copy is false - send data synchronously + // - if there are multiple buffer copy and + // __offload_parallel_copy is true - send data asynchronously + // It concerns only big size data - greater than __offload_use_async_buffer_write. + // Data of size less than __offload_use_async_buffer_write are sent synchronously. + // Synchronous transfer results in better performance in COI. 
+ // __offload_parallel_copy is false by default but can be changed + // via environment variable OFFLOAD_PARALLEL_COPY + if (!m_initial_need_runfunction && __offload_parallel_copy) { + int big_size_count = 0; + for (int i = 0; i < m_vars_total; i++) { + if (m_vars[i].direction.in && + m_vars[i].size >= __offload_use_async_buffer_write) { + switch (m_vars[i].type.dst) { + case c_data: + case c_void_ptr: + case c_cean_var: + if (m_vars[i].flags.is_static_dstn) { + big_size_count++; + } + break; + case c_string_ptr: + case c_data_ptr: + case c_cean_var_ptr: + case c_dv_ptr: + case c_dv_data: + case c_dv_ptr_data: + case c_dv_data_slice: + case c_dv_ptr_data_slice: + big_size_count++; + break; + default: + break; + } + } + } + if (big_size_count > 1) { + should_use_async_buffer_write = true; + } + } + + if (m_stream != no_stream && m_vars_total != 0) { + get_stream_in_dependencies(in_deps_amount, in_deps); + } // Initiate send for pointer data for (int i = 0; i < m_vars_total; i++) { + uint64_t sent_data = m_vars[i].size; + uint32_t in_deps_amount_save; + COIEVENT *in_deps_save; + + if (m_vars_extra[i].omp_last_event_type == c_last_write) { + in_deps_amount_save = in_deps_amount; + in_deps_save = in_deps; + in_deps_amount = m_in_deps_total; + if (in_deps_amount > 0) { + in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * in_deps_amount); + if (in_deps == NULL) + LIBOFFLOAD_ERROR(c_malloc); + memcpy(in_deps, m_in_deps,in_deps_amount * sizeof(COIEVENT)); + } + } switch (m_vars[i].type.dst) { case c_data_ptr_array: break; @@ -2468,7 +3337,8 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) m_vars[i].flags.is_static_dstn) { COIEVENT *event = (is_async || - m_vars[i].size >= __offload_use_async_buffer_write) ? + (should_use_async_buffer_write && + m_vars[i].size >= __offload_use_async_buffer_write)) ? &m_in_deps[m_in_deps_total++] : 0; PtrData* dst_data = m_vars[i].into ? m_vars_extra[i].dst_data : @@ -2482,7 +3352,8 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) if (m_vars[i].flags.is_noncont_src || m_vars[i].flags.is_noncont_dst) { if (!send_noncontiguous_pointer_data( - i, src_data, dst_data, event)) { + i, src_data, dst_data, event, sent_data, + in_deps_amount, in_deps)) { return false; } } @@ -2490,13 +3361,13 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) res = COI::BufferCopy( dst_data->mic_buf, src_data->cpu_buf, - m_vars[i].mic_offset - dst_data->alloc_disp + + m_vars[i].mic_offset + m_vars[i].offset + m_vars[i].disp, m_vars_extra[i].cpu_offset + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, - 0, 0, + in_deps_amount, in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -2511,12 +3382,12 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) m_vars[i].type.src); res = COI::BufferWrite( dst_data->mic_buf, - m_vars[i].mic_offset - dst_data->alloc_disp + + m_vars[i].mic_offset + m_vars[i].offset + m_vars[i].disp, base + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, - 0, 0, + in_deps_amount, in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -2526,7 +3397,7 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) report_coi_error(c_buf_write, res); } } - ptr_sent += m_vars[i].size; + ptr_sent += sent_data; } break; @@ -2537,7 +3408,8 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) if (m_vars[i].direction.in && m_vars[i].size > 0) { COIEVENT *event = (is_async || - m_vars[i].size >= __offload_use_async_buffer_write) ? 
+ (should_use_async_buffer_write && + m_vars[i].size >= __offload_use_async_buffer_write)) ? &m_in_deps[m_in_deps_total++] : 0; PtrData* dst_data = m_vars[i].into ? m_vars_extra[i].dst_data : @@ -2551,19 +3423,20 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) if (m_vars[i].flags.is_noncont_src || m_vars[i].flags.is_noncont_dst) { send_noncontiguous_pointer_data( - i, src_data, dst_data, event); + i, src_data, dst_data, event, sent_data, + in_deps_amount, in_deps); } else if (src_data != 0 && src_data->cpu_buf != 0) { res = COI::BufferCopy( dst_data->mic_buf, src_data->cpu_buf, - m_vars[i].mic_offset - dst_data->alloc_disp + + m_vars[i].mic_offset + m_vars[i].offset + m_vars[i].disp, m_vars_extra[i].cpu_offset + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, - 0, 0, + in_deps_amount, in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -2578,12 +3451,12 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) m_vars[i].type.src); res = COI::BufferWrite( dst_data->mic_buf, - m_vars[i].mic_offset - dst_data->alloc_disp + + m_vars[i].mic_offset + m_vars[i].offset + m_vars[i].disp, base + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, - 0, 0, + in_deps_amount, in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -2594,7 +3467,7 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) } } - ptr_sent += m_vars[i].size; + ptr_sent += sent_data; } break; @@ -2609,26 +3482,27 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) COIEVENT *event = (is_async || - m_vars[i].size >= __offload_use_async_buffer_write) ? + (should_use_async_buffer_write && + m_vars[i].size >= __offload_use_async_buffer_write)) ? &m_in_deps[m_in_deps_total++] : 0; if (m_vars[i].flags.is_noncont_src || m_vars[i].flags.is_noncont_dst) { send_noncontiguous_pointer_data( - i, src_data, ptr_data, event); + i, src_data, ptr_data, event, sent_data, + in_deps_amount, in_deps); } else if (src_data && src_data->cpu_buf != 0) { res = COI::BufferCopy( ptr_data->mic_buf, src_data->cpu_buf, - m_vars[i].offset + ptr_data->mic_offset - - ptr_data->alloc_disp + + m_vars[i].offset + ptr_data->mic_offset + m_vars[i].disp, m_vars_extra[i].cpu_offset + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, - 0, 0, + in_deps_amount, in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -2643,12 +3517,12 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) m_vars[i].type.src); res = COI::BufferWrite( ptr_data->mic_buf, - ptr_data->mic_offset - ptr_data->alloc_disp + + ptr_data->mic_offset + m_vars[i].offset + m_vars[i].disp, base + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, - 0, 0, + in_deps_amount, in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -2658,7 +3532,7 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) report_coi_error(c_buf_write, res); } } - ptr_sent += m_vars[i].size; + ptr_sent += sent_data; } break; @@ -2678,25 +3552,27 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) m_vars_extra[i].src_data : 0; COIEVENT *event = (is_async || - m_vars[i].size >= __offload_use_async_buffer_write) ? + (should_use_async_buffer_write && + m_vars[i].size >= __offload_use_async_buffer_write)) ? 
&m_in_deps[m_in_deps_total++] : 0; if (m_vars[i].flags.is_noncont_src || m_vars[i].flags.is_noncont_dst) { send_noncontiguous_pointer_data( - i, src_data, dst_data, event); + i, src_data, dst_data, event, sent_data, + in_deps_amount, in_deps); } else if (src_data && src_data->cpu_buf != 0) { res = COI::BufferCopy( dst_data->mic_buf, src_data->cpu_buf, - m_vars[i].offset - dst_data->alloc_disp + + m_vars[i].offset + dst_data->mic_offset + m_vars[i].disp, m_vars_extra[i].cpu_offset + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, - 0, 0, + in_deps_amount, in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -2711,12 +3587,12 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) m_vars[i].type.src); res = COI::BufferWrite( dst_data->mic_buf, - dst_data->mic_offset - dst_data->alloc_disp + + dst_data->mic_offset + m_vars[i].offset + m_vars[i].disp, base + m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, - 0, 0, + in_deps_amount, in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -2727,14 +3603,18 @@ bool OffloadDescriptor::send_pointer_data(bool is_async) } } - ptr_sent += m_vars[i].size; + ptr_sent += sent_data; } break; default: break; } - + if (m_vars_extra[i].omp_last_event_type == c_last_write) { + in_deps_amount = in_deps_amount_save; + in_deps = in_deps_save; + register_omp_event_call_back(&m_in_deps[m_in_deps_total - 1], info); + } // alloc field isn't used at target. // We can reuse it for offset of array pointers. if (m_vars_extra[i].is_arr_ptr_el) { @@ -2901,7 +3781,7 @@ bool OffloadDescriptor::gather_copyin_data() return true; } -bool OffloadDescriptor::compute() +bool OffloadDescriptor::compute(void *info) { OffloadTimer timer(get_timer_data(), c_offload_host_start_compute); @@ -2926,12 +3806,21 @@ bool OffloadDescriptor::compute() // dispatch task COIRESULT res; COIEVENT event; - res = m_device.compute(m_compute_buffers, + uint32_t in_deps_amount = m_in_deps_total; + COIEVENT *in_deps = m_in_deps_total > 0 ? m_in_deps : 0; + + if (0 == m_in_deps_total && m_stream != no_stream) { + get_stream_in_dependencies(in_deps_amount, in_deps); + } + + res = m_device.compute(m_stream, + m_compute_buffers, misc, misc_len, ret, ret_len, - m_in_deps_total, - m_in_deps_total > 0 ? m_in_deps : 0, + in_deps_amount, + in_deps, &event); + if (res != COI_SUCCESS) { if (m_status != 0) { m_status->result = translate_coi_error(res); @@ -2940,6 +3829,10 @@ bool OffloadDescriptor::compute() report_coi_error(c_pipeline_run_func, res); } + if (m_omp_async_last_event_type == c_last_runfunc) { + register_omp_event_call_back(&event, info); + } + m_in_deps_total = 1; m_in_deps[0] = event; } @@ -2947,34 +3840,114 @@ bool OffloadDescriptor::compute() return true; } -// recieve pointer data if source or destination or both of them are +// receive pointer data if source or destination or both of them are // noncontiguous. There is guarantee that length of destination enough for -// transfered data. -bool OffloadDescriptor::recieve_noncontiguous_pointer_data( +// transferred data. 
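Both the send path above and the receive routine declared next decompose a noncontiguous transfer the same way: each side is walked as a sequence of contiguous ranges, and every step moves the smaller of the two current range sizes with a single BufferWrite/BufferCopy call, collecting one completion event per chunk. The following is a minimal, self-contained sketch of that decomposition, not liboffloadmic code; Range, copy_noncontiguous and the copy_chunk callback are illustrative names.

#include <cstdint>
#include <cstddef>
#include <vector>
#include <algorithm>

struct Range { int64_t offset; int64_t length; };   // one contiguous piece

// Walk source and destination range lists in lockstep; each iteration copies
// min(current source remainder, current destination remainder) bytes and
// advances whichever side was exhausted.
template <typename Copy>
static void copy_noncontiguous(const std::vector<Range> &src,
                               const std::vector<Range> &dst,
                               Copy copy_chunk)
{
    std::size_t si = 0, di = 0;
    int64_t s_off = 0, d_off = 0, s_left = 0, d_left = 0;

    while (si < src.size() && di < dst.size()) {
        if (s_left == 0) { s_off = src[si].offset; s_left = src[si].length; }
        if (d_left == 0) { d_off = dst[di].offset; d_left = dst[di].length; }

        int64_t chunk = std::min(s_left, d_left);
        copy_chunk(d_off, s_off, chunk);   // one BufferWrite/BufferCopy call

        s_off += chunk; s_left -= chunk; if (s_left == 0) ++si;
        d_off += chunk; d_left -= chunk; if (d_left == 0) ++di;
    }
}

This per-chunk event model is also why, just before the copy loops, the dependency arrays are reallocated with room for one event per expected contiguous chunk.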
+bool OffloadDescriptor::receive_noncontiguous_pointer_data( int i, - char* base, COIBUFFER dst_buf, - COIEVENT *event + COIEVENT *event, + uint64_t &received_data, + uint32_t in_deps_amount, + COIEVENT *in_deps ) { int64_t offset_src, offset_dst; int64_t length_src, length_dst; int64_t length_src_cur, length_dst_cur; - int64_t recieve_size, data_recieved = 0; + int64_t receive_size; COIRESULT res; bool dst_is_empty = true; bool src_is_empty = true; + char *base = offload_get_src_base( + m_vars[i].into ? + static_cast<char*>(m_vars[i].into) : + static_cast<char*>(m_vars[i].ptr), + m_vars[i].type.dst); + received_data = 0; + // Set length_src and length_dst length_src = (m_vars_extra[i].read_rng_src) ? m_vars_extra[i].read_rng_src->range_size : m_vars[i].size; length_dst = !m_vars[i].into ? length_src : (m_vars_extra[i].read_rng_dst) ? m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size; - recieve_size = (length_src < length_dst) ? length_src : length_dst; - + receive_size = (length_src < length_dst) ? length_src : length_dst; + + // If BufferReadMultiD is defined we can set values of required arguments + // and transfer noncontiguous data via call to the COI routine. + if (__offload_use_coi_noncontiguous_transfer && COI::BufferReadMultiD) { + struct Arr_Desc* arr_desc_dst; + struct Arr_Desc* arr_desc_src; + int64_t size_src, size_dst; + + offset_src = (m_vars_extra[i].read_rng_src)? + m_vars_extra[i].read_rng_src->init_offset : m_vars[i].disp; + size_src = m_vars_extra[i].read_rng_src ? + cean_get_transf_size(m_vars_extra[i].read_rng_src) : + m_vars[i].size; + + offset_dst = (m_vars_extra[i].read_rng_dst)? + m_vars_extra[i].read_rng_dst->init_offset : m_vars_extra[i].cpu_disp; + size_dst = m_vars_extra[i].read_rng_dst ? + cean_get_transf_size(m_vars_extra[i].read_rng_dst) : m_vars[i].size; + + int64_t el_size = (!m_vars[i].into || + (m_vars_extra[i].read_rng_src && + m_vars_extra[i].read_rng_dst)) ? + 1 : + m_vars_extra[i].read_rng_src ? + m_vars_extra[i].read_rng_src->arr_desc->dim[ + m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size : + m_vars_extra[i].read_rng_dst->arr_desc->dim[ + m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size; + arr_desc_src = (m_vars_extra[i].read_rng_src) ? + m_vars_extra[i].read_rng_src->arr_desc : + make_arr_desc(NULL, // don't required for source + offset_src/el_size, size_src/el_size, + el_size); + arr_desc_dst = !m_vars[i].into ? arr_desc_src : + (m_vars_extra[i].read_rng_dst) ? + m_vars_extra[i].read_rng_dst->arr_desc : + make_arr_desc(NULL, + offset_dst/el_size, size_src/el_size, el_size); + + arr_desc_dst->base = reinterpret_cast<int64_t>(base); + + res = COI::BufferReadMultiD( + m_vars_extra[i].src_data->mic_buf, // SourceBuffer + m_vars[i].offset + m_vars[i].mic_offset - + m_vars_extra[i].src_data->alloc_disp, // Offset + (void*)arr_desc_dst, // descriptor of DestArray + (void*)arr_desc_src, // descriptor of SrcArray + COI_COPY_UNSPECIFIED, // Type + in_deps_amount, // Number of in Dependencies + in_deps, // array of in Dependencies + event); // out Dependency + if (res != COI_SUCCESS) { + if (m_status != 0) { + m_status->result = translate_coi_error(res); + return false; + } + report_coi_error(c_buf_copy, res); + } + return(true); + } + // if event is defined we must multiplate for all contiguous intervals + // that will be Copied/Read. + // Take in account that we already have 1 event. + if (event) { + m_out_deps_allocated += (length_src / receive_size) * + ((m_vars_extra[i].read_rng_src) ? 
+ m_vars_extra[i].read_rng_src->range_max_number : 1) ; + m_out_deps = + (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_out_deps_allocated); + m_out_deps_total--; + } + // consequently get contiguous ranges, - // define corresponded destination offset and recieve data + // define corresponded destination offset and receive data do { // get sorce offset if (src_is_empty) { @@ -2985,8 +3958,8 @@ bool OffloadDescriptor::recieve_noncontiguous_pointer_data( break; } } - else if (data_recieved == 0) { - offset_src = 0; + else if (received_data == 0) { + offset_src = m_vars[i].disp; } else { break; @@ -2996,9 +3969,9 @@ bool OffloadDescriptor::recieve_noncontiguous_pointer_data( else { // if source is contiguous or its contiguous range is greater // than destination one - offset_src += recieve_size; + offset_src += receive_size; } - length_src_cur -= recieve_size; + length_src_cur -= receive_size; src_is_empty = length_src_cur == 0; // get destination offset @@ -3027,23 +4000,24 @@ bool OffloadDescriptor::recieve_noncontiguous_pointer_data( else { // if destination is contiguous or its contiguous range is greater // than source one - offset_dst += recieve_size; + offset_dst += receive_size; } - length_dst_cur -= recieve_size; + length_dst_cur -= receive_size; dst_is_empty = length_dst_cur == 0; - + if (event) { + event = &m_out_deps[m_out_deps_total++]; + } if (dst_buf != 0) { res = COI::BufferCopy( dst_buf, m_vars_extra[i].src_data->mic_buf, m_vars_extra[i].cpu_offset + offset_dst, m_vars[i].offset + offset_src + - m_vars[i].mic_offset - - m_vars_extra[i].src_data->alloc_disp, - recieve_size, + m_vars[i].mic_offset, + receive_size, COI_COPY_UNSPECIFIED, - m_in_deps_total, - m_in_deps_total > 0 ? m_in_deps : 0, + in_deps_amount, + in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -3057,13 +4031,12 @@ bool OffloadDescriptor::recieve_noncontiguous_pointer_data( res = COI::BufferRead( m_vars_extra[i].src_data->mic_buf, m_vars[i].offset + offset_src + - m_vars[i].mic_offset - - m_vars_extra[i].src_data->alloc_disp, + m_vars[i].mic_offset, base + offset_dst, - recieve_size, + receive_size, COI_COPY_UNSPECIFIED, - m_in_deps_total, - m_in_deps_total > 0 ? m_in_deps : 0, + in_deps_amount, + in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -3073,20 +4046,109 @@ bool OffloadDescriptor::recieve_noncontiguous_pointer_data( report_coi_error(c_buf_read, res); } } - data_recieved += recieve_size; + received_data += receive_size; } while (true); return true; } -bool OffloadDescriptor::receive_pointer_data(bool is_async) +bool OffloadDescriptor::receive_pointer_data(bool is_async, + bool first_run, void *info) { OffloadTimer timer(get_timer_data(), c_offload_host_start_buffers_reads); + bool should_use_async_buffer_read = m_initial_need_runfunction; uint64_t ptr_received = 0; COIRESULT res; + // For offload_transfer and offload with empty body without signal: + // - if there is only one buffer copy - get data synchronously + // - if there are multiple buffer copy and + // __offload_parallel_copy is false - get data synchronously + // - if there are multiple buffer copy + // and __offload_parallel_copy is true - get data asynchronously + // It concerns only data with size greater than __offload_use_async_buffer_read. + // Data of size less than __offload_use_async_buffer_read are received synchronously. + // Synchronous transfer results in better performance in COI. 
+ // __offload_parallel_copy is false by default but can be changed + // via environment variable OFFLOAD_PARALLEL_COPY + if (!m_initial_need_runfunction && __offload_parallel_copy) { + int big_size_count = 0; + + for (int i = 0; i < m_vars_total; i++) { + if (m_vars[i].direction.out && + m_vars[i].size >= __offload_use_async_buffer_read) { + // preallocated OUT only at second run + if (first_run == m_vars[i].flags.preallocated) { + continue; + } + switch (m_vars[i].type.src) { + case c_data: + case c_void_ptr: + case c_cean_var: + if (m_vars[i].flags.is_static) { + big_size_count++; + } + break; + case c_string_ptr: + case c_data_ptr: + case c_cean_var_ptr: + case c_dv_data: + case c_dv_ptr_data: + case c_dv_data_slice: + case c_dv_ptr_data_slice: + case c_dv_ptr: + big_size_count++; + break; + default: + break; + } + } + } + if (big_size_count > 1) { + should_use_async_buffer_read = true; + } + } + uint32_t in_deps_amount = m_in_deps_total; + COIEVENT *in_deps = m_in_deps_total > 0 ? m_in_deps : 0; + + if (0 == m_in_deps_total && + m_stream != no_stream && + m_vars_total != 0) { + get_stream_in_dependencies(in_deps_amount, in_deps); + } + for (int i = 0; i < m_vars_total; i++) { + uint64_t received_data = m_vars[i].size; + uint32_t in_deps_amount_save; + COIEVENT *in_deps_save; + + if (m_vars_extra[i].omp_last_event_type == c_last_read) { + in_deps_amount_save = in_deps_amount; + in_deps_save = in_deps; + + in_deps_amount += m_out_deps_total; + if (in_deps_amount > 0) { + in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * in_deps_amount); + if (in_deps == NULL) + LIBOFFLOAD_ERROR(c_malloc); + memcpy(in_deps, in_deps_save, + in_deps_amount_save * sizeof(COIEVENT)); + memcpy(in_deps + in_deps_amount_save * sizeof(COIEVENT), + m_out_deps, + m_out_deps_total * sizeof(COIEVENT)); + } + } + // At first run don't receive by preallocated target pointer as the + //pointer value will be ready later after call to scatter_copyout_data + if (first_run && m_vars[i].alloc_if && m_vars[i].flags.preallocated) { + m_preallocated_alloc = true; + // need one more call to OffloadDescriptor::receive_pointer_data + if (m_vars[i].direction.out) { + m_out_with_preallocated = true; + } + continue; + } switch (m_vars[i].type.src) { case c_data_ptr_array: break; @@ -3098,7 +4160,8 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) COIEVENT *event = (is_async || m_in_deps_total > 0 || - m_vars[i].size >= __offload_use_async_buffer_read) ? + (should_use_async_buffer_read && + m_vars[i].size >= __offload_use_async_buffer_read)) ? &m_out_deps[m_out_deps_total++] : 0; PtrData *ptr_data = NULL; COIBUFFER dst_buf = NULL; // buffer at host @@ -3127,8 +4190,9 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) if (m_vars[i].flags.is_noncont_src || m_vars[i].flags.is_noncont_dst) { - recieve_noncontiguous_pointer_data( - i, base, dst_buf, event); + receive_noncontiguous_pointer_data( + i, dst_buf, event, received_data, + in_deps_amount, in_deps); } else if (dst_buf != 0) { res = COI::BufferCopy( @@ -3139,8 +4203,8 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) m_vars[i].offset + m_vars[i].disp, m_vars[i].size, COI_COPY_UNSPECIFIED, - m_in_deps_total, - m_in_deps_total > 0 ? m_in_deps : 0, + in_deps_amount, + in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -3158,8 +4222,8 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) m_vars_extra[i].cpu_disp, m_vars[i].size, COI_COPY_UNSPECIFIED, - m_in_deps_total, - m_in_deps_total > 0 ? 
m_in_deps : 0, + in_deps_amount, + in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -3169,7 +4233,7 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) report_coi_error(c_buf_read, res); } } - ptr_received += m_vars[i].size; + ptr_received += received_data; } break; @@ -3186,7 +4250,8 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) COIEVENT *event = (is_async || m_in_deps_total > 0 || - m_vars[i].size >= __offload_use_async_buffer_read) ? + (should_use_async_buffer_read && + m_vars[i].size >= __offload_use_async_buffer_read)) ? &m_out_deps[m_out_deps_total++] : 0; uint64_t dst_offset = 0; @@ -3241,8 +4306,10 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) if (m_vars[i].flags.is_noncont_src || m_vars[i].flags.is_noncont_dst) { - recieve_noncontiguous_pointer_data( - i, base, dst_buf, event); + receive_noncontiguous_pointer_data( + i, dst_buf, event, received_data, + in_deps_amount, + in_deps); } else if (dst_buf != 0) { res = COI::BufferCopy( @@ -3250,12 +4317,11 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) m_vars_extra[i].src_data->mic_buf, dst_offset, m_vars[i].offset + m_vars[i].disp + - m_vars[i].mic_offset - - m_vars_extra[i].src_data->alloc_disp, + m_vars[i].mic_offset, m_vars[i].size, COI_COPY_UNSPECIFIED, - m_in_deps_total, - m_in_deps_total > 0 ? m_in_deps : 0, + in_deps_amount, + in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -3269,13 +4335,12 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) res = COI::BufferRead( m_vars_extra[i].src_data->mic_buf, m_vars[i].offset + m_vars[i].disp + - m_vars[i].mic_offset - - m_vars_extra[i].src_data->alloc_disp, + m_vars[i].mic_offset, base + dst_offset, m_vars[i].size, COI_COPY_UNSPECIFIED, - m_in_deps_total, - m_in_deps_total > 0 ? 
m_in_deps : 0, + in_deps_amount, + in_deps, event); if (res != COI_SUCCESS) { if (m_status != 0) { @@ -3285,7 +4350,7 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) report_coi_error(c_buf_read, res); } } - ptr_received += m_vars[i].size; + ptr_received += received_data; } break; } @@ -3294,6 +4359,11 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) break; } + if (m_vars_extra[i].omp_last_event_type == c_last_read) { + in_deps_amount = in_deps_amount_save; + in_deps = in_deps_save; + register_omp_event_call_back(&m_out_deps[m_out_deps_total - 1], info); + } // destroy buffers for obsolete stacks if (m_destroy_stack.size() != 0) { for (PtrDataList::iterator it = m_destroy_stack.begin(); @@ -3312,8 +4382,13 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) m_vars[i].type.src == c_void_ptr || m_vars[i].type.src == c_cean_var)) { AutoData *auto_data = m_vars_extra[i].auto_data; - if (auto_data != 0 && auto_data->remove_reference() == 0) { - m_device.remove_auto_data(auto_data->cpu_addr.start()); + if (auto_data != 0) { + if (m_vars[i].flags.always_delete) { + auto_data->nullify_reference(); + } + else if(auto_data->remove_reference() == 0) { + m_device.remove_auto_data(auto_data->cpu_addr.start()); + } } } @@ -3338,7 +4413,12 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) ptr_data->cpu_addr.start()); // remove association from map - m_device.remove_ptr_data(ptr_data->cpu_addr.start()); + if (m_vars[i].flags.targetptr) { + m_device.remove_targetptr_data(ptr_data->cpu_addr.start()); + } + else { + m_device.remove_ptr_data(ptr_data->cpu_addr.start()); + } } } else if (VAR_TYPE_IS_PTR(m_vars[i].type.dst) || @@ -3357,7 +4437,12 @@ bool OffloadDescriptor::receive_pointer_data(bool is_async) ptr_data->cpu_addr.start()); // remove association from map - m_device.remove_ptr_data(ptr_data->cpu_addr.start()); + if (m_vars[i].flags.targetptr) { + m_device.remove_targetptr_data(ptr_data->cpu_addr.start()); + } + else { + m_device.remove_ptr_data(ptr_data->cpu_addr.start()); + } } } } @@ -3416,6 +4501,60 @@ bool OffloadDescriptor::scatter_copyout_data() m_out.init_buffer(data, m_out_datalen); for (int i = 0; i < m_vars_total; i++) { + bool src_is_for_mic = (m_vars[i].direction.out || + m_vars[i].into == NULL); + + if (m_vars[i].type.src != c_data_ptr_array && + m_vars[i].flags.preallocated && m_vars[i].alloc_if) { + PtrData *ptr_data; + void *ptr_value; + void ** cpu_ptr = src_is_for_mic ? + reinterpret_cast<void**>(m_vars[i].ptr) : + reinterpret_cast<void**>(m_vars[i].into); + void* alloc_base = NULL; + int64_t alloc_disp = 0; + int64_t alloc_size; + if (m_vars_extra[i].alloc != NULL) { + // array descriptor + const Arr_Desc *ap = + static_cast<const Arr_Desc*>(m_vars_extra[i].alloc); + + __arr_data_offset_and_length(ap, alloc_disp, alloc_size); + + alloc_base = reinterpret_cast<void*>(ap->base); + } + + // get pointer to target memory + m_out.receive_data(&ptr_value, sizeof(void*)); + + // add new entry + if (!alloc_ptr_data( + ptr_data, + ptr_value, + (alloc_base != NULL) ? + alloc_disp : m_vars[i].disp, + (alloc_base != NULL) ? 
+ alloc_size : m_vars[i].size, + alloc_disp, + 0, + m_vars[i].flags.targetptr, + m_vars[i].flags.preallocated, + m_vars[i].flags.pin)) { + return false; + } + + ptr_data->add_reference(); + *cpu_ptr = ptr_value; + if (src_is_for_mic) { + m_vars_extra[i].src_data = ptr_data; + } + else { + m_vars_extra[i].dst_data = ptr_data; + } + m_vars[i].offset = (char*) ptr_value - + (char*) ptr_data->cpu_addr.start(); + } + switch (m_vars[i].type.src) { case c_data_ptr_array: break; @@ -3478,8 +4617,8 @@ bool OffloadDescriptor::scatter_copyout_data() return true; } -void get_arr_desc_numbers( - const arr_desc *ap, +static void get_arr_desc_numbers( + const Arr_Desc *ap, int64_t el_size, int64_t &offset, int64_t &size, @@ -3500,33 +4639,12 @@ void get_arr_desc_numbers( } } -arr_desc * make_arr_desc( - void* ptr_val, - int64_t extent_start_val, - int64_t extent_elements_val, - int64_t size -) -{ - arr_desc *res; - res = (arr_desc *)malloc(sizeof(arr_desc)); - if (res == NULL) - LIBOFFLOAD_ERROR(c_malloc); - res->base = reinterpret_cast<int64_t>(ptr_val); - res->rank = 1; - res->dim[0].size = size; - res->dim[0].lindex = 0; - res->dim[0].lower = extent_start_val; - res->dim[0].upper = extent_elements_val + extent_start_val - 1; - res->dim[0].stride = 1; - return res; -} - bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) { int pointers_number; int tmp_val; int new_index = m_vars_total; - const arr_desc *ap; + const Arr_Desc *ap; const VarDesc3 *vd3 = static_cast<const VarDesc3*>(m_vars[i].ptr); int flags = vd3->array_fields; bool src_is_for_mic = (m_vars[i].direction.out || @@ -3545,14 +4663,16 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) ReadArrElements<int64_t> alloc_elem; - ap = static_cast<const arr_desc*>(vd3->ptr_array); + ap = static_cast<const Arr_Desc*>(vd3->ptr_array); - // "pointers_number" for total number of transfered pointers. + // "pointers_number" for total number of transferred pointers. // For each of them we create new var_desc and put it at the bottom // of the var_desc's array get_arr_desc_numbers(ap, sizeof(void *), ptr.offset, ptr.size, pointers_number, ptr.ranges); - ptr.base = reinterpret_cast<char*>(ap->base); + ptr.base = (m_vars[i].flags.is_pointer) ? + *(reinterpret_cast<char**>(ap->base)) : + reinterpret_cast<char*>(ap->base); // 2. prepare memory for new var_descs m_vars_total += pointers_number; @@ -3575,7 +4695,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) // 3. 
Prepare for reading new var_desc's fields // EXTENT START if ((flags & (1<<flag_extent_start_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->extent_start); + ap = static_cast<const Arr_Desc*>(vd3->extent_start); get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, ext_start.offset, ext_start.size, tmp_val, ext_start.ranges); ext_start.base = reinterpret_cast<char*>(ap->base); @@ -3595,7 +4715,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) // EXTENT ELEMENTS NUMBER if ((flags & (1<<flag_extent_elements_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->extent_elements); + ap = static_cast<const Arr_Desc*>(vd3->extent_elements); get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, ext_elements.offset, ext_elements.size, tmp_val, ext_elements.ranges); @@ -3616,7 +4736,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) // ALLOC_IF if ((flags & (1<<flag_alloc_if_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->alloc_if_array); + ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array); get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_if.offset, alloc_if.size, tmp_val, alloc_if.ranges); alloc_if.base = reinterpret_cast<char*>(ap->base); @@ -3628,12 +4748,12 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) } } else { - alloc_if.val = m_vars[i].count; + alloc_if.val = m_vars[i].alloc_if; } // FREE_IF if ((flags & (1<<flag_free_if_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->free_if_array); + ap = static_cast<const Arr_Desc*>(vd3->free_if_array); get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, free_if.offset, free_if.size, tmp_val, free_if.ranges); free_if.base = reinterpret_cast<char*>(ap->base); @@ -3645,13 +4765,13 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) } } else { - free_if.val = m_vars[i].count; + free_if.val = m_vars[i].free_if; } // ALIGN if ((flags & (1<<flag_align_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->align_array); + ap = static_cast<const Arr_Desc*>(vd3->align_array); get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, align.offset, align.size, tmp_val, align.ranges); align.base = reinterpret_cast<char*>(ap->base); @@ -3669,7 +4789,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) // 3.1 INTO if (m_vars[i].into) { - ap = static_cast<const arr_desc*>(m_vars[i].into); + ap = static_cast<const Arr_Desc*>(m_vars[i].into); get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into.offset, into.size, tmp_val, into.ranges); into.base = reinterpret_cast<char*>(ap->base); @@ -3683,7 +4803,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) // 3.2 INTO_START if ((flags & (1<<flag_into_start_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->into_start); + ap = static_cast<const Arr_Desc*>(vd3->into_start); get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_start.offset, into_start.size, tmp_val, into_start.ranges); into_start.base = reinterpret_cast<char*>(ap->base); @@ -3704,7 +4824,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) // 3.3 INTO_ELEMENTS if ((flags & (1<<flag_into_elements_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->into_elements); + ap = static_cast<const Arr_Desc*>(vd3->into_elements); get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_elem.offset, into_elem.size, tmp_val, into_elem.ranges); into_elem.base = reinterpret_cast<char*>(ap->base); @@ -3725,7 +4845,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) // 
alloc_start if ((flags & (1<<flag_alloc_start_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->alloc_start); + ap = static_cast<const Arr_Desc*>(vd3->alloc_start); get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_start.offset, alloc_start.size, tmp_val, alloc_start.ranges); @@ -3747,7 +4867,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) // alloc_elem if ((flags & (1<<flag_alloc_elements_is_array)) != 0) { - ap = static_cast<const arr_desc*>(vd3->alloc_elements); + ap = static_cast<const Arr_Desc*>(vd3->alloc_elements); get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_elem.offset, alloc_elem.size, tmp_val, alloc_elem.ranges); alloc_elem.base = reinterpret_cast<char*>(ap->base); @@ -3846,6 +4966,9 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) m_vars[new_index + k].flags.bits = m_vars[i].flags.bits; m_vars[new_index + k].offset = 0; m_vars[new_index + k].size = m_vars[i].size; + m_vars[new_index + k].flags.targetptr = m_vars[i].flags.targetptr; + m_vars[new_index + k].flags.preallocated = + m_vars[i].flags.preallocated; if (ext_start.val == 0) { m_vars[new_index + k].count = ext_elements.val; @@ -3901,6 +5024,7 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) m_vars[new_index + k].type.src = type_src; m_vars[new_index + k].type.dst = type_dst; + m_vars_extra[new_index + k].alloc = m_vars[new_index + k].alloc; m_vars_extra[new_index + k].is_arr_ptr_el = 1; m_vars_extra[new_index + k].ptr_arr_offset = src_is_for_mic ? ptr.offset : into.offset; @@ -3912,6 +5036,52 @@ bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i) return true; } +// Gets in dependencies of the previous offload via the stream "m_stream". +// Out argument in_deps_amount - address of amount of the dependencies +// Out argument in_deps - array of dependencies. +// Description of the dependencies scheme for streams : +// ---------------------------------------------------- +// Every offload forms DAG consisted of 3 nodes: +// for in-transfers, runfunction and out-transfers. +// Every node has in-dependencies and out-dependencies +// Out-dependencies of previous node forms in-dependencies of current node. +// In-dependencies of 1-st node (of in-transfers) without streams is equal +// to NULL. For streams in-dependencies of 1-st node is equal to list of out +// dependencies of last node of previous offload via this stream. +// So we can say that DAGs of 2 consequent offloads via the same stream are +// connected by the way described above. 
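As a concrete restatement of the scheme just described, the selection of the in-dependencies for a new offload on a stream reduces to a small rule; Event and PrevOffload below are placeholder types for illustration only, and the actual logic is in get_stream_in_dependencies() that follows.

#include <cstdint>

typedef uint64_t Event;                        // stands in for COIEVENT

struct PrevOffload {
    uint32_t out_deps_total; Event *out_deps;  // events of its out-transfers
    uint32_t in_deps_total;  Event *in_deps;   // events of its sends/run-function
};

// The first node of the new offload waits on the last node of the previous
// offload in the same stream: its out-transfer events if it had any,
// otherwise whatever events it did produce.  With no previous offload the
// dependency list stays empty.
static void stream_tail(const PrevOffload *prev,
                        uint32_t &deps_count, Event *&deps)
{
    if (prev == 0)
        return;
    if (prev->out_deps_total) {
        deps_count = prev->out_deps_total; deps = prev->out_deps;
    }
    else if (prev->in_deps_total) {
        deps_count = prev->in_deps_total;  deps = prev->in_deps;
    }
}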
+void OffloadDescriptor::get_stream_in_dependencies( + uint32_t &in_deps_amount, + COIEVENT* &in_deps +) +{ + if (m_stream != no_stream && m_stream != 0) { + Stream * stream = Stream::find_stream(m_stream, false); + if (!stream) { + LIBOFFLOAD_ERROR(c_offload_no_stream, + m_device.get_logical_index()); + LIBOFFLOAD_ABORT; + } + OffloadDescriptor* offload = stream->get_last_offload(); + + // if it's the first offload in the stream + if (!offload) { + return; + } + // if last offload has out-tranfers + if (offload->m_out_deps_total) { + in_deps_amount = offload->m_out_deps_total; + in_deps = offload->m_out_deps; + } + // last offload only sends pointer data or run function or both of them + // and has no out-transfers + else if (offload->m_in_deps_total) { + in_deps_amount = offload->m_in_deps_total; + in_deps = offload->m_in_deps; + } + } +} + static void __offload_fini_library(void) { OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ...\n"); @@ -3945,7 +5115,6 @@ static void __offload_init_library_once(void) COIRESULT res; uint32_t num_devices; std::bitset<MIC_ENGINES_MAX> devices; - prefix = report_get_message_str(c_report_host); // initialize trace @@ -3989,7 +5158,7 @@ static void __offload_init_library_once(void) } // get number of devices installed in the system - res = COI::EngineGetCount(COI_ISA_KNC, &num_devices); + res = COI::EngineGetCount(COI_ISA_MIC, &num_devices); if (res != COI_SUCCESS) { return; } @@ -4032,7 +5201,7 @@ static void __offload_init_library_once(void) // use all available devices for (int i = 0; i < num_devices; i++) { COIENGINE engine; - res = COI::EngineGetHandle(COI_ISA_KNC, i, &engine); + res = COI::EngineGetHandle(COI_ISA_MIC, i, &engine); if (res == COI_SUCCESS) { devices.set(i); } @@ -4055,12 +5224,64 @@ static void __offload_init_library_once(void) } } + // Get DMA channel count to pass it to COI + env_var = getenv("OFFLOAD_DMA_CHANNEL_COUNT"); + if (env_var != 0) { + int64_t new_val; + if (__offload_parse_int_string(env_var, new_val)) { + mic_dma_channel_count = new_val; + } + else { + LIBOFFLOAD_ERROR(c_invalid_env_var_value, + "OFFLOAD_DMA_CHANNEL_COUNT"); + } + } + + // Set COI_HOST_THREAD_AFFINITY if OFFLOAD_HOST_THREAD_AFFINITY is set. + // Use putenv instead of setenv as Windows has no setenv. + // Note: putenv requires its argument can't be freed or modified. + // So no free after call to putenv or elsewhere. + env_var = getenv("OFFLOAD_HOST_THREAD_AFFINITY"); + if (env_var != 0) { + char * new_env_var = + (char*) malloc(sizeof("COI_HOST_THREAD_AFFINITY=") + + sizeof(env_var) + 1); + sprintf(new_env_var, "COI_HOST_THREAD_AFFINITY=%s", env_var); + putenv(new_env_var); + } + // library search path for device binaries env_var = getenv("MIC_LD_LIBRARY_PATH"); if (env_var != 0) { mic_library_path = strdup(env_var); } + + // find target executable to be used if main application is not an + // offload build application. 
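These knobs, together with the ones parsed further below (OFFLOAD_PARALLEL_COPY, MIC_USE_COI_MULTI_D, the 4K/2M buffer reserve sizes), are read once during library initialization, so they can also be set programmatically as long as initialization has not yet run. The sketch below assumes a POSIX host; the values are arbitrary placeholders, and the affinity string format is whatever COI accepts for COI_HOST_THREAD_AFFINITY. The search for offload_main along MIC_LD_LIBRARY_PATH continues right below.

#include <cstdlib>

static void configure_offload_runtime_example()
{
    // Example values only, not recommendations.
    setenv("OFFLOAD_DMA_CHANNEL_COUNT",    "2",    1); // DMA channels handed to COI
    setenv("OFFLOAD_HOST_THREAD_AFFINITY", "0-3",  1); // forwarded as COI_HOST_THREAD_AFFINITY
    setenv("OFFLOAD_PARALLEL_COPY",        "1",    1); // allow parallel large-buffer copies
    setenv("MIC_USE_COI_MULTI_D",          "1",    1); // prefer BufferWriteMultiD/BufferReadMultiD
    setenv("MIC_4K_BUFFER_RESERVE_SIZE",   "64M",  1); // 4K-page reserve for COI buffers
    setenv("MIC_2M_BUFFER_RESERVE_SIZE",   "128M", 1); // 2M-page reserve for COI buffers
    // On Windows, putenv("NAME=value") would be used instead, as the runtime
    // itself does when it forwards OFFLOAD_HOST_THREAD_AFFINITY.
}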
+ const char *base_name = "offload_main"; + if (mic_library_path != 0) { + char *buf = strdup(mic_library_path); + char *try_name = (char*) alloca(strlen(mic_library_path) + + strlen(base_name) + 2); + char *dir, *ptr; + + for (dir = strtok_r(buf, PATH_SEPARATOR, &ptr); dir != 0; + dir = strtok_r(0, PATH_SEPARATOR, &ptr)) { + // compose a full path + sprintf(try_name, "%s/%s", dir, base_name); + + // check if such file exists + struct stat st; + if (stat(try_name, &st) == 0 && S_ISREG(st.st_mode)) { + mic_device_main = strdup(try_name); + break; + } + } + + free(buf); + } + // memory size reserved for COI buffers env_var = getenv("MIC_BUFFERSIZE"); if (env_var != 0) { @@ -4073,6 +5294,30 @@ static void __offload_init_library_once(void) } } + // memory size reserved for 4K pages for COI buffers + env_var = getenv("MIC_4K_BUFFER_RESERVE_SIZE"); + if (env_var != 0) { + uint64_t new_size; + if (__offload_parse_size_string(env_var, new_size)) { + mic_4k_buffer_size = new_size; + } + else { + LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_4K_BUFFER_RESERVE_SIZE"); + } + } + + // memory size reserved for 2M pages for COI buffers + env_var = getenv("MIC_2M_BUFFER_RESERVE_SIZE"); + if (env_var != 0) { + uint64_t new_size; + if (__offload_parse_size_string(env_var, new_size)) { + mic_2m_buffer_size = new_size; + } + else { + LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_2M_BUFFER_RESERVE_SIZE"); + } + } + // determine stacksize for the pipeline on the device env_var = getenv("MIC_STACKSIZE"); if (env_var != 0 && *env_var != '\0') { @@ -4170,11 +5415,9 @@ static void __offload_init_library_once(void) else if (strcmp(env_var, "on_offload_all") == 0) { __offload_init_type = c_init_on_offload_all; } -#ifndef TARGET_WINNT else if (strcmp(env_var, "on_start") == 0) { __offload_init_type = c_init_on_start; } -#endif // TARGET_WINNT else { LIBOFFLOAD_ERROR(c_invalid_env_var_value, offload_init_envname); } @@ -4206,6 +5449,32 @@ static void __offload_init_library_once(void) } } + // parallel copy of offload_transfer + env_var = getenv(parallel_copy_envname); + if (env_var != 0 && *env_var != '\0') { + int64_t new_val; + if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) { + __offload_parallel_copy = new_val; + } + else { + LIBOFFLOAD_ERROR(c_invalid_env_var_value, + parallel_copy_envname); + } + } + + // use COI interface for noncontiguous arrays transfer + env_var = getenv(use_coi_noncontiguous_transfer_envname); + if (env_var != 0 && *env_var != '\0') { + uint64_t new_size; + if (__offload_parse_size_string(env_var, new_size)) { + __offload_use_coi_noncontiguous_transfer = new_size; + } + else { + LIBOFFLOAD_ERROR(c_invalid_env_var_value, + use_coi_noncontiguous_transfer_envname); + } + } + // init ORSL ORSL::init(); } @@ -4242,7 +5511,20 @@ extern int __offload_init_library(void) return is_available; } -extern "C" void __offload_register_image(const void *target_image) +extern "C" bool __offload_target_image_is_executable(const void *target_image) +{ + const struct Image *image = static_cast<const struct Image*>(target_image); + + // decode image + const char *name = image->data; + const void *data = image->data + strlen(image->data) + 1; + + // determine image type + const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data); + return (hdr->e_type == ET_EXEC); +} + +extern "C" bool __offload_register_image(const void *target_image) { const struct Image *image = static_cast<const struct Image*>(target_image); @@ -4250,8 +5532,32 @@ extern "C" void __offload_register_image(const void 
*target_image) const char *name = image->data; const void *data = image->data + strlen(image->data) + 1; uint64_t size = image->size; - const char *origin = 0; + char *origin = (char *) malloc(strlen(image->data) + 1); uint64_t offset = 0; + const char *host_name = image->data; + int i; + + if (origin == NULL) + LIBOFFLOAD_ERROR(c_malloc); + + // The origin name is the name of the file on the host + // this is used by Vtune, since it is a fat binary we + // use the host file name of the fat binary. + // Driver prepends the host file name ending with "?" + // to the image->data name so need to extract the string + i = 0; + while (*host_name != '\0' && *host_name != '?') { + origin[i] = *host_name; + host_name++; + i++; + } + origin[i] = '\0'; + // Implies the host name does not exist which really should + // not occur. Allow this since only consumer is Vtune. + if ((i == 0) || (*host_name != '?')) { + free(origin); + origin = 0; + } // our actions depend on the image type const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data); @@ -4279,19 +5585,31 @@ extern "C" void __offload_register_image(const void *target_image) } } } - break; + return mic_engines_total > 0; case ET_DYN: - // Registration code for libraries is called from the DllMain - // context (on windows) and thus we cannot do anything usefull - // here. So we just add it to the list of pending libraries for - // the later use. - __target_libs_lock.lock(); - __target_libs = true; - __target_libs_list.push_back(TargetImage(name, data, size, - origin, offset)); - __target_libs_lock.unlock(); - break; + { + char *fullname = origin; + // We add the library to a list of pending libraries + __target_libs_lock.lock(); + __target_libs = true; + __target_libs_list.push_back( + TargetImage(name, data, size, fullname, offset)); + __target_libs_lock.unlock(); + // If __target_exe is set, then main has started running + // If not main, then we can't do anything useful here + // because this registration code is called from DllMain + // context (on windows). 
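The origin extraction above can be summarized in a few lines; split_origin is a hypothetical helper shown only to illustrate the "host_name?image_name" convention the driver uses. The ET_DYN registration path then resumes below with the __target_exe check.

#include <cstring>
#include <string>

// Returns the host file name encoded before '?', or an empty string when no
// usable host name is present (the code above then keeps origin == 0).
static std::string split_origin(const char *image_name)
{
    const char *q = std::strchr(image_name, '?');
    if (q == NULL || q == image_name)
        return std::string();
    return std::string(image_name, q - image_name);
}

// e.g. split_origin("a.out?target_name") yields "a.out",
//      while split_origin("target_name") yields "" (no host name encoded).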
+ if (__target_exe != 0) { + // There is no need to delay loading the library + if (!__offload_init_library()) { + // Couldn't validate library as a fat offload library + LIBOFFLOAD_ERROR(c_unknown_binary_type); + exit(1); + } + } + return true; + } default: // something is definitely wrong, issue an error and exit @@ -4330,6 +5648,12 @@ extern "C" void __offload_unregister_image(const void *target_image) __offload_fini_library(); } + else if (hdr->e_type == ET_DYN) { + for (int i = 0; i < mic_engines_total; i++) { + mic_engines[i].unload_library(data, name); + } + + } } // Runtime trace interface for user programs @@ -4362,19 +5686,24 @@ int _Offload_signaled(int index, void *signal) __offload_init_library(); // check index value - if (index < 0 || mic_engines_total <= 0) { + if (index < 0) { LIBOFFLOAD_ERROR(c_offload_signaled1, index); LIBOFFLOAD_ABORT; } + index %= mic_engines_total; + // find associated async task OffloadDescriptor *task = - mic_engines[index % mic_engines_total].find_signal(signal, false); + mic_engines[index].find_signal(signal, false); if (task == 0) { LIBOFFLOAD_ERROR(c_offload_signaled2, signal); LIBOFFLOAD_ABORT; } - + // if signal is removed by wait completing + else if (task == SIGNAL_IS_REMOVED) { + return (true); + } return task->is_signaled(); } @@ -4386,6 +5715,153 @@ void _Offload_report(int val) } } +int _Offload_find_associated_mic_memory( + int target, + const void* cpu_addr, + void** cpu_base_addr, + uint64_t* buf_length, + void** mic_addr, + uint64_t* mic_buf_start_offset, + int* is_static +) +{ + __offload_init_library(); + + // check target value + if (target < 0) { + LIBOFFLOAD_ERROR(c_offload_signaled1, target); + LIBOFFLOAD_ABORT; + } + target %= mic_engines_total; + + // find existing association in pointer table + PtrData* ptr_data = mic_engines[target].find_ptr_data(cpu_addr); + if (ptr_data == 0) { + OFFLOAD_TRACE(3, "Association does not exist\n"); + return 0; + } + + OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n", + ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(), + ptr_data->is_static); + + if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) { + COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf, + &ptr_data->mic_addr); + if (res != COI_SUCCESS) { + return 0; + } + } + *cpu_base_addr = const_cast<void *>(ptr_data->cpu_addr.start()); + *buf_length = ptr_data->cpu_addr.length() - ptr_data->alloc_disp; + *mic_addr = (void *)(ptr_data->mic_addr + ptr_data->mic_offset); + *mic_buf_start_offset = ptr_data->alloc_disp; + *is_static = ptr_data->is_static; + return ptr_data->is_static ? 
1 : ptr_data->get_reference(); +} + +_Offload_stream _Offload_stream_create( + int device, // MIC device number + int number_of_cpus // Cores allocated to the stream + ) +{ + __offload_init_library(); + + // check target value + if (device < 0) { + LIBOFFLOAD_ERROR(c_offload_signaled1, device); + LIBOFFLOAD_ABORT; + } + device %= mic_engines_total; + + // Create new stream and get its handle + _Offload_stream handle = Stream::add_stream(device, number_of_cpus); + if (handle == 0) { + OFFLOAD_TRACE(3, "Can't create stream\n"); + return 0; + } + + // create pipeline associated with the new stream + mic_engines[device].get_pipeline(handle); + + return(handle); +} + +int _Offload_stream_destroy( + int device, // MIC device number + _Offload_stream handle // stream to destroy + ) +{ + __offload_init_library(); + + // check target value + if (device < 0) { + LIBOFFLOAD_ERROR(c_offload_signaled1, device); + LIBOFFLOAD_ABORT; + } + device %= mic_engines_total; + + mic_engines[device].stream_destroy(handle); + + return(true); +} + +int _Offload_stream_completed(int device, _Offload_stream handler) +{ + __offload_init_library(); + + // check index value + if (device < 0) { + LIBOFFLOAD_ERROR(c_offload_signaled1, device); + LIBOFFLOAD_ABORT; + } + + device %= mic_engines_total; + + // get stream + Stream * stream; + + if (handler != 0) { + stream = Stream::find_stream(handler, false); + + // the stream was not created or was destroyed + if (!stream) { + LIBOFFLOAD_ERROR(c_offload_no_stream, device); + LIBOFFLOAD_ABORT; + } + + // find associated async task + OffloadDescriptor *task = stream->get_last_offload(); + + // offload was completed by offload_wait pragma or wait clause + if (task == 0) { + return(true); + } + return task->is_signaled(); + } + // zero handler is for all streams at the device + else { + StreamMap stream_map = Stream::all_streams; + for (StreamMap::iterator it = stream_map.begin(); + it != stream_map.end(); it++) { + Stream * stream = it->second; + // find associated async task + OffloadDescriptor *task = stream->get_last_offload(); + + // offload was completed by offload_wait pragma or wait clause + if (task == 0) { + return(true); + } + // if even one stream is not completed result is false + if (!task->is_signaled()) { + return false; + } + } + // no uncompleted streams + return true; + } +} + // IDB support int __dbg_is_attached = 0; int __dbg_target_id = -1; |
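To close, a hedged sketch of how the stream interface introduced above is meant to be driven from host code. The prototypes are restated from this file; the _Offload_stream handle type is assumed here, and the way individual offloads are bound to a stream is compiler-specific and therefore only indicated by a comment.

#include <stdint.h>

// Prototypes restated for the sketch; in a real program they come from the
// offload runtime's public header, and the handle typedef is an assumption.
extern "C" {
typedef uint64_t _Offload_stream;
_Offload_stream _Offload_stream_create(int device, int number_of_cpus);
int _Offload_stream_destroy(int device, _Offload_stream handle);
int _Offload_stream_completed(int device, _Offload_stream handle);
}

void stream_example()
{
    // Create a stream on logical MIC 0 that may use 4 of its cores.
    _Offload_stream s = _Offload_stream_create(0, 4);

    // ... enqueue offloads bound to this stream (compiler-specific syntax) ...

    // Poll until every offload queued to the stream has completed.
    while (!_Offload_stream_completed(0, s))
        ;   // or overlap useful host work here

    _Offload_stream_destroy(0, s);
}

Passing 0 as the stream handle to _Offload_stream_completed queries all streams on the device, as the implementation above shows.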