/* Test dispatch of events to callbacks. */ /* { dg-additional-options "--param=openacc-kernels=decompose" } */ /* { dg-additional-options "-fopt-info-omp-all" } { dg-additional-options "-foffload=-fopt-info-omp-all" } */ /* { dg-additional-options "--param=openacc-privatization=noisy" } { dg-additional-options "-foffload=--param=openacc-privatization=noisy" } Prune a few: uninteresting: { dg-prune-output {note: variable 'D\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} } */ /* It's only with Tcl 8.5 (released in 2007) that "the variable 'varName' passed to 'incr' may be unset, and in that case, it will be set to [...]", so to maintain compatibility with earlier Tcl releases, we manually initialize counter variables: { dg-line l_dummy[variable c_compute 0] } { dg-message dummy {} { target iN-VAl-Id } l_dummy } to avoid "WARNING: dg-line var l_dummy defined, but not used". */ #undef NDEBUG #include #include #include #include /* Use explicit 'copyin' clauses, to work around "'firstprivate' optimizations", which will cause the value at the point of call to be used (*before* any potential modifications done in callbacks), as opposed to its address being taken, which then later gets dereferenced (*after* any modifications done in callbacks). */ #define COPYIN(...) copyin(__VA_ARGS__) /* See the 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' reference in 'libgomp.texi'. */ #define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0 #define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__) volatile // TODO PR90488 static int state = -1; #define STATE_OP(state, op) \ do \ { \ typeof (state) state_o = (state); \ (void) state_o; \ (state)op; \ DEBUG_printf("state: %d -> %d\n", state_o, (state)); \ } \ while (0) static acc_device_t acc_device_type; static int acc_device_num; static int num_gangs, num_workers, vector_length; static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) { DEBUG_printf ("%s\n", __FUNCTION__); assert (acc_device_type != acc_device_host); assert (state == 0); STATE_OP (state, = 1); assert (prof_info->event_type == acc_ev_enqueue_launch_start); assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); assert (prof_info->version == _ACC_PROF_INFO_VERSION); assert (prof_info->device_type == acc_device_type); assert (prof_info->device_number == acc_device_num); assert (prof_info->thread_id == -1); assert (prof_info->async == acc_async_noval); assert (prof_info->async_queue == prof_info->async); assert (prof_info->src_file == NULL); assert (prof_info->func_name == NULL); assert (prof_info->line_no == -1); assert (prof_info->end_line_no == -1); assert (prof_info->func_line_no == -1); assert (prof_info->func_end_line_no == -1); assert (event_info->launch_event.event_type == prof_info->event_type); assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES); assert (event_info->launch_event.parent_construct == acc_construct_parallel); assert (event_info->launch_event.implicit == 1); assert (event_info->launch_event.tool_info == NULL); assert (event_info->launch_event.kernel_name != NULL); { const char *s = strstr (event_info->launch_event.kernel_name, "main"); assert (s != NULL); s = strstr (s, "omp_fn"); assert (s != NULL); } if (num_gangs < 1) assert (event_info->launch_event.num_gangs >= 1); else { #ifdef __OPTIMIZE__ assert (event_info->launch_event.num_gangs == num_gangs); #else /* No parallelized OpenACC 'kernels' constructs. Unparallelized OpenACC 'kernels' constructs must get launched as 1 x 1 x 1 GPU kernels. */ assert (event_info->launch_event.num_gangs == 1); #endif } if (num_workers < 1) assert (event_info->launch_event.num_workers >= 1); else { #ifdef __OPTIMIZE__ assert (event_info->launch_event.num_workers == num_workers); #else /* See 'num_gangs' above. */ assert (event_info->launch_event.num_workers == 1); #endif } if (vector_length < 1) assert (event_info->launch_event.vector_length >= 1); else if (acc_device_type == acc_device_nvidia) /* ... is special. */ assert (event_info->launch_event.vector_length == 32); else if (acc_device_type == acc_device_radeon) /* ...and so is this. */ assert (event_info->launch_event.vector_length == 64); else { #ifdef __OPTIMIZE__ assert (event_info->launch_event.vector_length == vector_length); #else /* See 'num_gangs' above. */ assert (event_info->launch_event.vector_length == 1); #endif } if (acc_device_type == acc_device_host) assert (api_info->device_api == acc_device_api_none); else if (acc_device_type == acc_device_radeon) assert (api_info->device_api == acc_device_api_other); else assert (api_info->device_api == acc_device_api_cuda); assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); assert (api_info->device_type == prof_info->device_type); assert (api_info->vendor == -1); assert (api_info->device_handle == NULL); assert (api_info->context_handle == NULL); assert (api_info->async_handle == NULL); } static acc_prof_reg reg; static acc_prof_reg unreg; static acc_prof_lookup_func lookup; void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_) { DEBUG_printf ("%s\n", __FUNCTION__); reg = reg_; unreg = unreg_; lookup = lookup_; } int main() { acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup); STATE_OP (state, = 0); reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg); assert (state == 0); acc_device_type = acc_get_device_type (); acc_device_num = acc_get_device_num (acc_device_type); assert (state == 0); /* Parallelism dimensions: compiler/runtime decides. */ STATE_OP (state, = 0); num_gangs = num_workers = vector_length = 0; { #define N 100 int x[N]; #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */ /* { dg-note {OpenACC 'kernels' decomposition: variable 'i' declared in block requested to be made addressable} {} { target *-*-* } l_compute$c_compute } { dg-note {variable 'i' made addressable} {} { target *-*-* } l_compute$c_compute } */ /* { dg-note {variable 'i' declared in block is candidate for adjusting OpenACC privatization level} {} { target *-*-* } l_compute$c_compute } */ /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { ! __OPTIMIZE__ } } l_compute$c_compute } { dg-optimized {assigned OpenACC gang loop parallelism} {} { target __OPTIMIZE__ } l_compute$c_compute } */ { /* { dg-note {beginning 'parloops' part in OpenACC 'kernels' region} {} { target *-*-* } .+1 } */ for (int i = 0; i < N; ++i) x[i] = i * i; } if (acc_device_type == acc_device_host) assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */ else assert (state == 1); for (int i = 0; i < N; ++i) if (x[i] != i * i) __builtin_abort (); #undef N } /* Parallelism dimensions: literal. */ STATE_OP (state, = 0); num_gangs = 30; num_workers = 3; vector_length = 5; { #define N 100 int x[N]; #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */ \ num_gangs (30) num_workers (3) vector_length (5) /* { dg-note {OpenACC 'kernels' decomposition: variable 'i' declared in block requested to be made addressable} {} { target *-*-* } l_compute$c_compute } { dg-note {variable 'i' made addressable} {} { target *-*-* } l_compute$c_compute } */ /* { dg-note {variable 'i' declared in block is candidate for adjusting OpenACC privatization level} {} { target *-*-* } l_compute$c_compute } */ /* { dg-warning {using 'vector_length \(32\)', ignoring 5} {} { target { __OPTIMIZE__ && openacc_nvidia_accel_selected } } l_compute$c_compute } */ /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { ! __OPTIMIZE__ } } l_compute$c_compute } { dg-optimized {assigned OpenACC gang loop parallelism} {} { target __OPTIMIZE__ } l_compute$c_compute } */ { /* { dg-note {beginning 'parloops' part in OpenACC 'kernels' region} {} { target *-*-* } .+1 } */ for (int i = 0; i < N; ++i) x[i] = i * i; } if (acc_device_type == acc_device_host) assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */ else assert (state == 1); for (int i = 0; i < N; ++i) if (x[i] != i * i) __builtin_abort (); #undef N } /* Parallelism dimensions: variable. */ STATE_OP (state, = 0); num_gangs = 22; num_workers = 5; vector_length = 7; { #define N 100 int x[N]; #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */ \ num_gangs (num_gangs) num_workers (num_workers) vector_length (vector_length) /* { dg-note {OpenACC 'kernels' decomposition: variable 'i' declared in block requested to be made addressable} {} { target *-*-* } l_compute$c_compute } { dg-note {variable 'i' made addressable} {} { target *-*-* } l_compute$c_compute } */ /* { dg-note {variable 'i' declared in block is candidate for adjusting OpenACC privatization level} {} { target *-*-* } l_compute$c_compute } */ /* { dg-warning {using 'vector_length \(32\)', ignoring runtime setting} {} { target { __OPTIMIZE__ && openacc_nvidia_accel_selected } } l_compute$c_compute } */ /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { ! __OPTIMIZE__ } } l_compute$c_compute } { dg-optimized {assigned OpenACC gang loop parallelism} {} { target __OPTIMIZE__ } l_compute$c_compute } */ { /* { dg-note {beginning 'parloops' part in OpenACC 'kernels' region} {} { target *-*-* } .+1 } */ for (int i = 0; i < N; ++i) x[i] = i * i; } if (acc_device_type == acc_device_host) assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */ else assert (state == 1); for (int i = 0; i < N; ++i) if (x[i] != i * i) __builtin_abort (); #undef N } return 0; }