diff options
Diffstat (limited to 'liboffloadmic/runtime/offload_omp_host.cpp')
-rw-r--r-- | liboffloadmic/runtime/offload_omp_host.cpp | 786 |
1 files changed, 784 insertions, 2 deletions
diff --git a/liboffloadmic/runtime/offload_omp_host.cpp b/liboffloadmic/runtime/offload_omp_host.cpp index 1449847..0439fec 100644 --- a/liboffloadmic/runtime/offload_omp_host.cpp +++ b/liboffloadmic/runtime/offload_omp_host.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2015 Intel Corporation. All Rights Reserved. + Copyright (c) 2014-2016 Intel Corporation. All Rights Reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -29,9 +29,11 @@ #include <omp.h> -#include "offload.h" +//#include <stdlib.h> +//#include "offload.h" #include "compiler_if_host.h" + // OpenMP API void omp_set_default_device(int num) __GOMP_NOTHROW @@ -52,6 +54,786 @@ int omp_get_num_devices() __GOMP_NOTHROW return mic_engines_total; } +// OpenMP 4.5 APIs + +// COI supports 3-dim multiD transfers +#define MAX_ARRAY_RANK 3 + +int omp_get_initial_device( + void +) __GOMP_NOTHROW +{ + return -1; +} + +void* omp_target_alloc( + size_t size, + int device_num +) __GOMP_NOTHROW +{ + __offload_init_library(); + + OFFLOAD_TRACE(2, "omp_target_alloc(%lld, %d)\n", size, device_num); + + if (device_num < -1) { + LIBOFFLOAD_ERROR(c_invalid_device_number); + exit(1); + } + + void* result = 0; + + // malloc on CPU + if (device_num == -1) { + // We do not check for malloc returning NULL because the + // specification of this API includes the possibility of failure. 
+ // The user will check the returned result + result = malloc(size); + return result; + } + + OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE( + TARGET_MIC, device_num, 0, NULL, __func__, 0); + if (ofld != 0) { + VarDesc vars[2] = {0}; + + vars[0].type.src = c_data; + vars[0].type.dst = c_data; + vars[0].direction.bits = c_parameter_in; + vars[0].size = sizeof(size); + vars[0].count = 1; + vars[0].ptr = &size; + + vars[1].type.src = c_data; + vars[1].type.dst = c_data; + vars[1].direction.bits = c_parameter_out; + vars[1].size = sizeof(result); + vars[1].count = 1; + vars[1].ptr = &result; + + OFFLOAD_OFFLOAD(ofld, "omp_target_alloc_target", + 0, 2, vars, NULL, 0, 0, 0); + } + return result; +} + +void omp_target_free( + void *device_ptr, + int device_num +) __GOMP_NOTHROW +{ + __offload_init_library(); + + OFFLOAD_TRACE(2, "omp_target_free(%p, %d)\n", device_ptr, device_num); + + if (device_num < -1) { + LIBOFFLOAD_ERROR(c_invalid_device_number); + exit(1); + } + + // free on CPU + if (device_num == -1) { + free(device_ptr); + return; + } + + OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE( + TARGET_MIC, device_num, 0, NULL, __func__, 0); + if (ofld) { + VarDesc vars[1] = {0}; + + vars[0].type.src = c_data; + vars[0].type.dst = c_data; + vars[0].direction.bits = c_parameter_in; + vars[0].size = sizeof(device_ptr); + vars[0].count = 1; + vars[0].ptr = &device_ptr; + + OFFLOAD_OFFLOAD(ofld, "omp_target_free_target", + 0, 1, vars, NULL, 0, 0, 0); + } +} + +int omp_target_is_present( + void *ptr, + int device_num +) __GOMP_NOTHROW +{ + __offload_init_library(); + + OFFLOAD_TRACE(2, "omp_target_is_present(%p, %d)\n", ptr, device_num); + + if (device_num < -1) { + LIBOFFLOAD_ERROR(c_invalid_device_number); + exit(1); + } + + if (device_num == -1) { + return false; + } + + // If OpenMP allows wrap-around for device numbers, enable next line + //device_num %= mic_engines_total; + + // lookup existing association in pointer table + PtrData* ptr_data = 
mic_engines[device_num].find_ptr_data(ptr); + if (ptr_data == 0) { + OFFLOAD_TRACE(3, "Address %p is not mapped on device %d\n", + ptr, device_num); + return false; + } + + OFFLOAD_TRACE(3, "Address %p found mapped on device %d\n", + ptr, device_num); + return true; +} + +int omp_target_memcpy( + void *dst, + void *src, + size_t length, + size_t dst_offset, + size_t src_offset, + int dst_device, + int src_device +) __GOMP_NOTHROW +{ + __offload_init_library(); + + OFFLOAD_TRACE(2, "omp_target_memcpy(%p, %p, %lld, %lld, %lld, %d, %d)\n", + dst, src, length, dst_offset, src_offset, dst_device, src_device); + + if (dst_device < -1 || src_device < -1) { + LIBOFFLOAD_ERROR(c_invalid_device_number); + exit(1); + } + + char* srcp = (char *)src + src_offset; + char* dstp = (char *)dst + dst_offset; + + if (src_device == -1) { + // Source is CPU + if (dst_device == -1) { + // CPU -> CPU + memcpy(dstp, srcp, length); + return 0; + } else { + // CPU -> MIC + // COIBufferWrite + // If OpenMP allows wrap-around for device numbers, enable next line + //dst_device %= mic_engines_total; + + OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n", dstp); + COIBUFFER mic_buf; + COIRESULT res = COI::BufferCreateFromMemory(length, + COI_BUFFER_NORMAL, COI_SINK_MEMORY, dstp, + 1, &mic_engines[dst_device].get_process(), + &mic_buf); + if (res != COI_SUCCESS) { + LIBOFFLOAD_ERROR(c_buf_create_from_mem, res); + return 1; + } + res = COI::BufferWrite(mic_buf, 0, srcp, length, + COI_COPY_UNSPECIFIED, 0, 0, 0); + if (res != COI_SUCCESS) { + LIBOFFLOAD_ERROR(c_buf_write, res); + return 1; + } + res = COI::BufferDestroy(mic_buf); + if (res != COI_SUCCESS) { + LIBOFFLOAD_ERROR(c_buf_destroy, res); + return 1; + } + return 0; + } + } else { + // Source is device + if (dst_device == -1) { + // MIC -> CPU + // COIBufferRead + + // If OpenMP allows wrap-around for device numbers, enable next line + //src_device %= mic_engines_total; + + OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n", 
srcp); + COIBUFFER mic_buf; + COIRESULT res = COI::BufferCreateFromMemory(length, + COI_BUFFER_NORMAL, COI_SINK_MEMORY, srcp, + 1, &mic_engines[src_device].get_process(), + &mic_buf); + if (res != COI_SUCCESS) { + LIBOFFLOAD_ERROR(c_buf_create_from_mem, res); + return 1; + } + res = COI::BufferRead(mic_buf, 0, dstp, length, + COI_COPY_UNSPECIFIED, 0, 0, 0); + if (res != COI_SUCCESS) { + LIBOFFLOAD_ERROR(c_buf_read, res); + return 1; + } + res = COI::BufferDestroy(mic_buf); + if (res != COI_SUCCESS) { + LIBOFFLOAD_ERROR(c_buf_destroy, res); + return 1; + } + return 0; + } else { + // some MIC -> some MIC + if (src_device == dst_device) { + // MIC local copy will be done as remote memcpy + + OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(TARGET_MIC, src_device, + 0, NULL, __func__, 0); + if (ofld) { + VarDesc vars[3] = {0}; + + vars[0].type.src = c_data; + vars[0].type.dst = c_data; + vars[0].direction.bits = c_parameter_in; + vars[0].size = sizeof(dstp); + vars[0].count = 1; + vars[0].ptr = &dstp; + + vars[1].type.src = c_data; + vars[1].type.dst = c_data; + vars[1].direction.bits = c_parameter_in; + vars[1].size = sizeof(srcp); + vars[1].count = 1; + vars[1].ptr = &srcp; + + vars[2].type.src = c_data; + vars[2].type.dst = c_data; + vars[2].direction.bits = c_parameter_in; + vars[2].size = sizeof(length); + vars[2].count = 1; + vars[2].ptr = &length; + + OFFLOAD_OFFLOAD(ofld, "omp_target_memcpy_target", + 0, 3, vars, NULL, 0, 0, 0); + return 0; + } else { + return 1; + } + } else { + // MICx -> MICy + // Allocate CPU buffer + char *cpu_mem = (char *)malloc(length); + if (cpu_mem == 0) { + LIBOFFLOAD_ERROR(c_malloc); + return 1; + } + int retval = 1; + if (omp_target_memcpy( + cpu_mem, srcp, length, 0, 0, -1, src_device) == 0) { + retval = omp_target_memcpy( + dstp, cpu_mem, length, 0, 0, dst_device, -1); + } + free(cpu_mem); + return retval; + } + } + } +} + +static size_t bytesize_at_this_dimension( + size_t element_size, + int num_dims, + const size_t* dimensions +) +{ + 
if (num_dims > 1) { + return dimensions[1] * + bytesize_at_this_dimension( + element_size, num_dims-1, dimensions+1); + } else { + return element_size; + } +} + +static void memcpy_rect( + char *dst, + char *src, + size_t element_size, + int num_dims, + const size_t *volume, + const size_t *dst_offsets, + const size_t *src_offsets, + const size_t *dst_dimensions, + const size_t *src_dimensions +) +{ + if (num_dims > 1) { + int count = volume[0]; + int dst_index = dst_offsets[0]; + int src_index = src_offsets[0]; + size_t dst_element_size = + bytesize_at_this_dimension(element_size, num_dims, dst_dimensions); + size_t src_element_size = + bytesize_at_this_dimension(element_size, num_dims, src_dimensions); + for (; count>0; dst_index++, src_index++, count--) { + memcpy_rect(dst+dst_element_size*dst_index, + src+src_element_size*src_index, + element_size, num_dims-1, volume+1, + dst_offsets+1, src_offsets+1, + dst_dimensions+1, src_dimensions+1); + } + } else { + memcpy(dst+dst_offsets[0]*element_size, + src+src_offsets[0]*element_size, + element_size * volume[0]); + } +} + +int omp_target_memcpy_rect( + void *dst_, + void *src_, + size_t element_size, + int num_dims, + const size_t *volume, + const size_t *dst_offsets, + const size_t *src_offsets, + const size_t *dst_dimensions, + const size_t *src_dimensions, + int dst_device, + int src_device +) __GOMP_NOTHROW +{ + char *dst = (char *)dst_; + char *src = (char *)src_; + + __offload_init_library(); + + OFFLOAD_TRACE(2, "omp_target_memcpy_rect(%p, %p, %lld, %d, " + "%p, %p, %p, %p, %p, %d, %d)\n", + dst, src, element_size, num_dims, + volume, dst_offsets, src_offsets, + dst_dimensions, src_dimensions, dst_device, src_device); + + // MAX_ARRAY_RANK dimensions are supported + if (dst == 0 && src == 0) { + return MAX_ARRAY_RANK; + } + + if (num_dims < 1 || num_dims > MAX_ARRAY_RANK || + element_size < 1 || + volume == 0 || dst_offsets == 0 || src_offsets == 0 || + dst_dimensions == 0 || src_dimensions == 0) { + return 
1; + } + + if (dst_device < -1 || src_device < -1) { + LIBOFFLOAD_ERROR(c_invalid_device_number); + exit(1); + } + + if (src_device == -1) { + // Source is CPU + if (dst_device == -1) { + // CPU -> CPU + memcpy_rect((char*)dst, (char*)src, element_size, num_dims, volume, + dst_offsets, src_offsets, + dst_dimensions, src_dimensions); + return 0; + } else { + // CPU -> MIC + // COIBufferWriteMultiD + struct arr_desc dst_desc; + struct arr_desc src_desc; + + dst_desc.base = (int64_t)dst; + dst_desc.rank = num_dims; + + src_desc.base = (int64_t)src; + src_desc.rank = num_dims; + + for (int i=0; i<num_dims; i++) + { + dst_desc.dim[i].size = bytesize_at_this_dimension( + element_size, + num_dims - i, + dst_dimensions + i); + dst_desc.dim[i].lindex = 0; + dst_desc.dim[i].lower = dst_offsets[i]; + dst_desc.dim[i].upper = dst_offsets[i] + volume[i] - 1; + dst_desc.dim[i].stride = 1; + + src_desc.dim[i].size = bytesize_at_this_dimension( + element_size, + num_dims - i, + src_dimensions + i); + src_desc.dim[i].lindex = 0; + src_desc.dim[i].lower = src_offsets[i]; + src_desc.dim[i].upper = src_offsets[i] + volume[i] - 1; + src_desc.dim[i].stride = 1; + } + __arr_desc_dump("", "dst", (const Arr_Desc*)&dst_desc, false, false); + __arr_desc_dump("", "src", (const Arr_Desc*)&src_desc, false, false); + + // If OpenMP allows wrap-around for device numbers, enable next line + //dst_device %= mic_engines_total; + + // Compute MIC buffer size + size_t dst_length = dst_dimensions[0] * bytesize_at_this_dimension( + element_size, + num_dims, + dst_dimensions); + + OFFLOAD_TRACE(3, + "Creating buffer from sink memory %llx of size %lld\n", + dst, dst_length); + COIBUFFER mic_buf; + COIRESULT res = COI::BufferCreateFromMemory(dst_length, + COI_BUFFER_NORMAL, COI_SINK_MEMORY, dst, + 1, &mic_engines[dst_device].get_process(), + &mic_buf); + if (res != COI_SUCCESS) { + LIBOFFLOAD_ERROR(c_buf_create_from_mem, res); + return 1; + } + res = COI::BufferWriteMultiD(mic_buf, + 
mic_engines[dst_device].get_process(), + 0, &dst_desc, &src_desc, + COI_COPY_UNSPECIFIED, 0, 0, 0); + if (res != COI_SUCCESS) { + LIBOFFLOAD_ERROR(c_buf_write, res); + return 1; + } + res = COI::BufferDestroy(mic_buf); + if (res != COI_SUCCESS) { + LIBOFFLOAD_ERROR(c_buf_destroy, res); + return 1; + } + return 0; + } + } else { + // Source is device + if (dst_device == -1) { + // COIBufferReadMultiD + struct arr_desc dst_desc; + struct arr_desc src_desc; + + dst_desc.base = (int64_t)dst; + dst_desc.rank = num_dims; + + src_desc.base = (int64_t)src; + src_desc.rank = num_dims; + + for (int i=0; i<num_dims; i++) + { + dst_desc.dim[i].size = bytesize_at_this_dimension( + element_size, + num_dims - i, + dst_dimensions + i); + dst_desc.dim[i].lindex = 0; + dst_desc.dim[i].lower = dst_offsets[i]; + dst_desc.dim[i].upper = dst_offsets[i] + volume[i] - 1; + dst_desc.dim[i].stride = 1; + + src_desc.dim[i].size = bytesize_at_this_dimension( + element_size, + num_dims - i, + src_dimensions + i); + src_desc.dim[i].lindex = 0; + src_desc.dim[i].lower = src_offsets[i]; + src_desc.dim[i].upper = src_offsets[i] + volume[i] - 1; + src_desc.dim[i].stride = 1; + } + __arr_desc_dump("", "dst", (const Arr_Desc*)&dst_desc, false, false); + __arr_desc_dump("", "src", (const Arr_Desc*)&src_desc, false, false); + + // If OpenMP allows wrap-around for device numbers, enable next line + //src_device %= mic_engines_total; + + // Compute MIC buffer size + size_t src_length = src_dimensions[0] * bytesize_at_this_dimension( + element_size, + num_dims, + src_dimensions); + + OFFLOAD_TRACE(3, + "Creating buffer from sink memory %llx of size %lld\n", + src, src_length); + COIBUFFER mic_buf; + COIRESULT res = COI::BufferCreateFromMemory(src_length, + COI_BUFFER_NORMAL, COI_SINK_MEMORY, src, + 1, &mic_engines[src_device].get_process(), + &mic_buf); + if (res != COI_SUCCESS) { + LIBOFFLOAD_ERROR(c_buf_create_from_mem, res); + return 1; + } + res = COI::BufferReadMultiD(mic_buf, 0, + &dst_desc, 
&src_desc, + COI_COPY_UNSPECIFIED, 0, 0, 0); + if (res != COI_SUCCESS) { + LIBOFFLOAD_ERROR(c_buf_write, res); + return 1; + } + res = COI::BufferDestroy(mic_buf); + if (res != COI_SUCCESS) { + LIBOFFLOAD_ERROR(c_buf_destroy, res); + return 1; + } + return 0; + } else { + // some MIC -> some MIC + if (src_device == dst_device) { + // MIC local copy will be done as remote memcpy_rect + struct parameters { + void *dst; + void *src; + size_t element_size; + int num_dims; + size_t array_info[MAX_ARRAY_RANK*5]; + } parameters = {dst, src, element_size, num_dims}; + int result; + + for (int i=0; i<num_dims; i++) + { + parameters.array_info[i] = volume[i]; + parameters.array_info[i+num_dims] = dst_offsets[i]; + parameters.array_info[i+num_dims*2] = src_offsets[i]; + parameters.array_info[i+num_dims*3] = dst_dimensions[i]; + parameters.array_info[i+num_dims*4] = src_dimensions[i]; + } + + OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(TARGET_MIC, src_device, + 0, NULL, __func__, 0); + if (ofld) { + VarDesc vars[1] = {0}; + + vars[0].type.src = c_data; + vars[0].type.dst = c_data; + vars[0].direction.bits = c_parameter_in; + vars[0].size = sizeof(parameters) - + (MAX_ARRAY_RANK - num_dims) * + 5 * sizeof(size_t); + vars[0].count = 1; + vars[0].ptr = &parameters; + + OFFLOAD_OFFLOAD(ofld, "omp_target_memcpy_rect_target", + 0, 1, vars, NULL, 0, 0, 0); + return 0; + } else { + return 1; + } + } else { + // MICx -> MICy + + // Compute transfer byte-count + size_t dst_length = element_size; + for (int i=0; i<num_dims; i++) { + dst_length *= volume[i]; + } + + // Allocate CPU buffer + char *cpu_mem = (char *)malloc(dst_length); + if (cpu_mem == 0) { + LIBOFFLOAD_ERROR(c_malloc); + return 1; + } + + // Create CPU offset and dimension arrays + // The CPU array collects the data in a contiguous block + size_t cpu_offsets[MAX_ARRAY_RANK]; + size_t cpu_dimensions[MAX_ARRAY_RANK]; + for (int i=0; i<num_dims; i++) { + cpu_offsets[i] = 0; + cpu_dimensions[i] = volume[i]; + } + + int retval = 1; + if 
(omp_target_memcpy_rect( + cpu_mem, src, element_size, num_dims, volume, + cpu_offsets, src_offsets, + cpu_dimensions, src_dimensions, + -1, src_device) == 0) { + retval = omp_target_memcpy_rect( + dst, cpu_mem, element_size, num_dims, volume, + dst_offsets, cpu_offsets, + dst_dimensions, cpu_dimensions, + dst_device, -1); + } + free(cpu_mem); + return retval; + } + } + } +} + +// host_ptr is key in table that yields association on device +// A COIBUFFER of specified size is created from the memory at +// device_ptr+device_offset on device_num +int omp_target_associate_ptr( + void *host_ptr, + void *device_ptr, + size_t size, + size_t device_offset, + int device_num +) __GOMP_NOTHROW +{ + COIRESULT res; + + __offload_init_library(); + + OFFLOAD_TRACE(2, "omp_target_associate_ptr(%p, %p, %lld, %lld, %d)\n", + host_ptr, device_ptr, size, device_offset, device_num); + + if (device_num < -1) { + LIBOFFLOAD_ERROR(c_invalid_device_number); + exit(1); + } + + // Associating to CPU is treated as failure + if (device_num == -1) { + return 1; + } + + // An incorrect size is treated as failure + if (size < 0) { + return 1; + } + + // If OpenMP allows wrap-around for device numbers, enable next line + //Engine& device = mic_engines[device_num % mic_engines_total]; + Engine& device = mic_engines[device_num]; + + // Does host pointer have association already? + // lookup existing association in pointer table + PtrData* ptr_data = device.find_ptr_data(host_ptr); + if (ptr_data != 0) { + OFFLOAD_TRACE(3, "Address %p is already mapped on device %d\n", + host_ptr, device_num); + // Is current device pointer and offset same as existing? 
+ if ((void*)ptr_data->mic_addr == device_ptr && + (size_t)ptr_data->alloc_disp == device_offset) { + return 0; + } else { + return 1; + } + } + + // Create association + OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n", + host_ptr, size); + + bool is_new; + ptr_data = device.insert_ptr_data(host_ptr, size, is_new); + ptr_data->is_omp_associate = true; + + // create CPU buffer + OFFLOAD_TRACE(3, + "Creating buffer from source memory %p, length %lld\n", + host_ptr, size); + + // result is not checked because we can continue without cpu + // buffer. In this case we will use COIBufferRead/Write + // instead of COIBufferCopy. + + COI::BufferCreateFromMemory(size, + COI_BUFFER_OPENCL, + 0, + host_ptr, + 1, + &device.get_process(), + &ptr_data->cpu_buf); + + // create MIC buffer + OFFLOAD_TRACE(3, + "Creating buffer from sink memory: addr %p, size %lld\n", + (char *)device_ptr + device_offset, size); + res = COI::BufferCreateFromMemory(size, + COI_BUFFER_NORMAL, + COI_SINK_MEMORY, + device_ptr, + 1, + &device.get_process(), + &ptr_data->mic_buf); + if (res != COI_SUCCESS) { + ptr_data->alloc_ptr_data_lock.unlock(); + return 1; + } + + // make buffer valid on the device. 
+ res = COI::BufferSetState(ptr_data->mic_buf, + device.get_process(), + COI_BUFFER_VALID, + COI_BUFFER_NO_MOVE, + 0, 0, 0); + if (res != COI_SUCCESS) { + ptr_data->alloc_ptr_data_lock.unlock(); + return 1; + } + + res = COI::BufferSetState(ptr_data->mic_buf, + COI_PROCESS_SOURCE, + COI_BUFFER_INVALID, + COI_BUFFER_NO_MOVE, + 0, 0, 0); + if (res != COI_SUCCESS) { + ptr_data->alloc_ptr_data_lock.unlock(); + return 1; + } + ptr_data->alloc_disp = device_offset; + ptr_data->alloc_ptr_data_lock.unlock(); + + return 0; +} + +int omp_target_disassociate_ptr( + void *host_ptr, + int device_num +) __GOMP_NOTHROW +{ + COIRESULT res; + + __offload_init_library(); + + OFFLOAD_TRACE(2, "omp_target_disassociate_ptr(%p, %d)\n", + host_ptr, device_num); + + if (device_num < -1) { + LIBOFFLOAD_ERROR(c_invalid_device_number); + exit(1); + } + + // Dissociating from CPU is treated as failure + if (device_num == -1) { + return 1; + } + + // If OpenMP allows wrap-around for device numbers, enable next line + //Engine& device = mic_engines[device_num % mic_engines_total]; + Engine& device = mic_engines[device_num]; + + // Lookup existing association in pointer table + PtrData* ptr_data = device.find_ptr_data(host_ptr); + + // Attempt to disassociate unassociated pointer is a failure + if (ptr_data == 0) { + return 1; + } + + // Destroy buffers + if (ptr_data->cpu_buf != 0) { + OFFLOAD_TRACE(3, "Destroying CPU buffer %p\n", ptr_data->cpu_buf); + COI::BufferDestroy(ptr_data->cpu_buf); + } + if (ptr_data->mic_buf != 0) { + OFFLOAD_TRACE(3, "Destroying MIC buffer %p\n", ptr_data->mic_buf); + COI::BufferDestroy(ptr_data->mic_buf); + } + + // Remove association from map + OFFLOAD_TRACE(3, "Removing association for addr %p\n", + ptr_data->cpu_addr.start()); + device.remove_ptr_data(ptr_data->cpu_addr.start()); + + return 0; +} + +// End of OpenMP 4.5 APIs + + // OpenMP API wrappers static void omp_set_int_target( |