diff options
Diffstat (limited to 'openmp/runtime/src/kmp_collapse.cpp')
-rw-r--r-- | openmp/runtime/src/kmp_collapse.cpp | 311 |
1 files changed, 311 insertions, 0 deletions
diff --git a/openmp/runtime/src/kmp_collapse.cpp b/openmp/runtime/src/kmp_collapse.cpp index 2c410ca..569d2c1 100644 --- a/openmp/runtime/src/kmp_collapse.cpp +++ b/openmp/runtime/src/kmp_collapse.cpp @@ -1272,6 +1272,304 @@ void kmp_calc_original_ivs_for_end( } } +/************************************************************************** + * Identify nested loop structure - loops come in the canonical form + * Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0} + * j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1} + * Upper Triangle matrix + * i = 0; i <= N; i++ {0,0}:{N,0} + * j = 0+1*i; j <= N; j++ {0,1}:{N,0} + * ************************************************************************/ +nested_loop_type_t +kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest, + /*in*/ kmp_index_t n) { + // only 2-level nested loops are supported + if (n != 2) { + return nested_loop_type_unkown; + } + // loops must be canonical + KMP_ASSERT( + (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) && + (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq)); + // check outer loop bounds: for triangular need to be {0,0}:{N,0} + kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].lb0_u64); + kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].ub0_u64); + kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].lb1_u64); + kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].ub1_u64); + if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) { + return nested_loop_type_unkown; + } + // check inner bounds to determine triangle type + kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, + original_bounds_nest[1].lb0_u64); + kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, + original_bounds_nest[1].ub0_u64); + kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, + original_bounds_nest[1].lb1_u64); + kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, + original_bounds_nest[1].ub1_u64); + // lower triangle loop inner bounds need to be {0,0}:{0/-1,1} + if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 && + (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) { + return nested_loop_type_lower_triangular_matrix; + } + // upper triangle loop inner bounds need to be {0,1}:{N,0} + if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 && + inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) { + return nested_loop_type_upper_triangular_matrix; + } + return nested_loop_type_unkown; +} + +/************************************************************************** + * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf + * Start point is x so the result is always > sqrt(x) + * The method has uniform convergence, PRECISION is set to 0.1 + * ************************************************************************/ +#define level_of_precision 0.1 +double sqrt_newton_approx(/*in*/ kmp_uint64 x) { + double sqrt_old = 0.; + double sqrt_new = (double)x; + do { + sqrt_old = sqrt_new; + sqrt_new = (sqrt_old + x / sqrt_old) / 2; + } while ((sqrt_old - sqrt_new) > level_of_precision); + return sqrt_new; +} + +/************************************************************************** + * Handle lower triangle matrix in the canonical form + * i = 0; i <= N; i++ {0,0}:{N,0} + * j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1} + * ************************************************************************/ +void kmp_handle_lower_triangle_matrix( + /*in*/ kmp_uint32 nth, + /*in*/ kmp_uint32 tid, + /*in */ kmp_index_t n, + /*in/out*/ bounds_info_t *original_bounds_nest, + /*out*/ bounds_info_t *chunk_bounds_nest) { + + // transfer loop types from the original loop to the chunks + for (kmp_index_t i = 0; i < n; ++i) { + chunk_bounds_nest[i] = original_bounds_nest[i]; + } + // cleanup iv variables + kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].ub0_u64); + kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].lb0_u64); + kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, + original_bounds_nest[1].ub0_u64); + // calculate the chunk's lower and upper bounds + // the total number of iterations in the loop is the sum of the arithmetic + // progression from the outer lower to outer upper bound (inclusive since the + // loop is canonical) note that less_than inner loops (inner_ub0 = -1) + // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0 + // + 1) -> N - 1 + kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0; + kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2; + // the current thread's number of iterations: + // each thread gets an equal number of iterations: total number of iterations + // divided by the number of threads plus, if there's a remainder, + // the first threads with the number up to the remainder get an additional + // iteration each to cover it + kmp_uint64 iter_current = + iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0); + // cumulative number of iterations executed by all the previous threads: + // threads with the tid below the remainder will have (iter_total/nth+1) + // elements, and so will all threads before them so the cumulative number of + // iterations executed by the all previous will be the current thread's number + // of iterations multiplied by the number of previous threads which is equal + // to the current thread's tid; threads with the number equal or above the + // remainder will have (iter_total/nth) elements so the cumulative number of + // iterations previously executed is its number of iterations multipled by the + // number of previous threads which is again equal to the current thread's tid + // PLUS all the remainder iterations that will have been executed by the + // previous threads + kmp_uint64 iter_before_current = + tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth)); + // cumulative number of iterations executed with the current thread is + // the cumulative number executed before it plus its own + kmp_uint64 iter_with_current = iter_before_current + iter_current; + // calculate the outer loop lower bound (lbo) which is the max outer iv value + // that gives the number of iterations that is equal or just below the total + // number of iterations executed by the previous threads, for less_than + // (1-based) inner loops (inner_ub0 == -1) it will be i.e. + // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0 + // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be: + // i.e. lbo*(lbo+1)/2<=iter_before_current => + // lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarily + // using a parameter to control the equation sign + kmp_int64 inner_adjustment = 1 + 2 * inner_ub0; + kmp_uint64 lower_bound_outer = + (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment + + 8 * iter_before_current) + + inner_adjustment) / + 2 - + inner_adjustment; + // calculate the inner loop lower bound which is the remaining number of + // iterations required to hit the total number of iterations executed by the + // previous threads giving the starting point of this thread + kmp_uint64 lower_bound_inner = + iter_before_current - + ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2; + // calculate the outer loop upper bound using the same approach as for the + // inner bound except using the total number of iterations executed with the + // current thread + kmp_uint64 upper_bound_outer = + (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment + + 8 * iter_with_current) + + inner_adjustment) / + 2 - + inner_adjustment; + // calculate the inner loop upper bound which is the remaining number of + // iterations required to hit the total number of iterations executed after + // the current thread giving the starting point of the next thread + kmp_uint64 upper_bound_inner = + iter_with_current - + ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2; + // adjust the upper bounds down by 1 element to point at the last iteration of + // the current thread the first iteration of the next thread + if (upper_bound_inner == 0) { + // {n,0} => {n-1,n-1} + upper_bound_outer -= 1; + upper_bound_inner = upper_bound_outer; + } else { + // {n,m} => {n,m-1} (m!=0) + upper_bound_inner -= 1; + } + + // assign the values, zeroing out lb1 and ub1 values since the iteration space + // is now one-dimensional + chunk_bounds_nest[0].lb0_u64 = lower_bound_outer; + chunk_bounds_nest[1].lb0_u64 = lower_bound_inner; + chunk_bounds_nest[0].ub0_u64 = upper_bound_outer; + chunk_bounds_nest[1].ub0_u64 = upper_bound_inner; + chunk_bounds_nest[0].lb1_u64 = 0; + chunk_bounds_nest[0].ub1_u64 = 0; + chunk_bounds_nest[1].lb1_u64 = 0; + chunk_bounds_nest[1].ub1_u64 = 0; + +#if 0 + printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n", + tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64, + chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total); +#endif +} + +/************************************************************************** + * Handle upper triangle matrix in the canonical form + * i = 0; i <= N; i++ {0,0}:{N,0} + * j = 0+1*i; j <= N; j++ {0,1}:{N,0} + * ************************************************************************/ +void kmp_handle_upper_triangle_matrix( + /*in*/ kmp_uint32 nth, + /*in*/ kmp_uint32 tid, + /*in */ kmp_index_t n, + /*in/out*/ bounds_info_t *original_bounds_nest, + /*out*/ bounds_info_t *chunk_bounds_nest) { + + // transfer loop types from the original loop to the chunks + for (kmp_index_t i = 0; i < n; ++i) { + chunk_bounds_nest[i] = original_bounds_nest[i]; + } + // cleanup iv variables + kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].ub0_u64); + kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, + original_bounds_nest[0].lb0_u64); + kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, + original_bounds_nest[1].ub0_u64); + // calculate the chunk's lower and upper bounds + // the total number of iterations in the loop is the sum of the arithmetic + // progression from the outer lower to outer upper bound (inclusive since the + // loop is canonical) note that less_than inner loops (inner_ub0 = -1) + // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0 + // + 1) -> N - 1 + kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1); + kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2; + // the current thread's number of iterations: + // each thread gets an equal number of iterations: total number of iterations + // divided by the number of threads plus, if there's a remainder, + // the first threads with the number up to the remainder get an additional + // iteration each to cover it + kmp_uint64 iter_current = + iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0); + // cumulative number of iterations executed by all the previous threads: + // threads with the tid below the remainder will have (iter_total/nth+1) + // elements, and so will all threads before them so the cumulative number of + // iterations executed by the all previous will be the current thread's number + // of iterations multiplied by the number of previous threads which is equal + // to the current thread's tid; threads with the number equal or above the + // remainder will have (iter_total/nth) elements so the cumulative number of + // iterations previously executed is its number of iterations multipled by the + // number of previous threads which is again equal to the current thread's tid + // PLUS all the remainder iterations that will have been executed by the + // previous threads + kmp_uint64 iter_before_current = + tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth)); + // cumulative number of iterations executed with the current thread is + // the cumulative number executed before it plus its own + kmp_uint64 iter_with_current = iter_before_current + iter_current; + // calculate the outer loop lower bound (lbo) which is the max outer iv value + // that gives the number of iterations that is equal or just below the total + // number of iterations executed by the previous threads, for less_than + // (1-based) inner loops (inner_ub0 == -1) it will be i.e. + // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0 + // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be: + // i.e. lbo*(lbo+1)/2<=iter_before_current => + // lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarily + // using a parameter to control the equatio sign + kmp_uint64 lower_bound_outer = + (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1; + ; + // calculate the inner loop lower bound which is the remaining number of + // iterations required to hit the total number of iterations executed by the + // previous threads giving the starting point of this thread + kmp_uint64 lower_bound_inner = + iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2; + // calculate the outer loop upper bound using the same approach as for the + // inner bound except using the total number of iterations executed with the + // current thread + kmp_uint64 upper_bound_outer = + (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1; + // calculate the inner loop upper bound which is the remaining number of + // iterations required to hit the total number of iterations executed after + // the current thread giving the starting point of the next thread + kmp_uint64 upper_bound_inner = + iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2; + // adjust the upper bounds down by 1 element to point at the last iteration of + // the current thread the first iteration of the next thread + if (upper_bound_inner == 0) { + // {n,0} => {n-1,n-1} + upper_bound_outer -= 1; + upper_bound_inner = upper_bound_outer; + } else { + // {n,m} => {n,m-1} (m!=0) + upper_bound_inner -= 1; + } + + // assign the values, zeroing out lb1 and ub1 values since the iteration space + // is now one-dimensional + chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer; + chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner; + chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer; + chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner; + chunk_bounds_nest[0].lb1_u64 = 0; + chunk_bounds_nest[0].ub1_u64 = 0; + chunk_bounds_nest[1].lb1_u64 = 0; + chunk_bounds_nest[1].ub1_u64 = 0; + +#if 0 + printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n", + tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64, + chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total); +#endif +} //----------Init API for non-rectangular loops-------------------------------- // Init API for collapsed loops (static, no chunks defined). @@ -1334,6 +1632,19 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, KMP_DEBUG_ASSERT(tid < nth); + // Handle special cases + nested_loop_type_t loop_type = + kmp_identify_nested_loop_structure(original_bounds_nest, n); + if (loop_type == nested_loop_type_lower_triangular_matrix) { + kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest, + chunk_bounds_nest); + return TRUE; + } else if (loop_type == nested_loop_type_upper_triangular_matrix) { + kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest, + chunk_bounds_nest); + return TRUE; + } + CollapseAllocator<kmp_uint64> original_ivs_start(n); if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n, |