Diffstat (limited to 'libcilkrts/runtime/cilk-abi-cilk-for.cpp')
-rw-r--r-- | libcilkrts/runtime/cilk-abi-cilk-for.cpp | 406 |
1 file changed, 406 insertions, 0 deletions
diff --git a/libcilkrts/runtime/cilk-abi-cilk-for.cpp b/libcilkrts/runtime/cilk-abi-cilk-for.cpp
new file mode 100644
index 0000000..4fa6dce
--- /dev/null
+++ b/libcilkrts/runtime/cilk-abi-cilk-for.cpp
@@ -0,0 +1,406 @@
+/* cilk-abi-cilk-for.cpp                  -*-C++-*-
+ *
+ *************************************************************************
+ *
+ * @copyright
+ * Copyright (C) 2011, 2013, Intel Corporation
+ * All rights reserved.
+ *
+ * @copyright
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * @copyright
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************/
+
+/* Implementation of the cilk_for ABI.
+ *
+ * This file must be C++, not C, in order to handle C++ exceptions correctly
+ * from within the body of the cilk_for loop.
+ */
+
+#include "internal/abi.h"
+#include "metacall_impl.h"
+#include "global_state.h"
+
+// Icky macros to determine if we're compiled with optimization. Based on
+// the declaration of __CILKRTS_ASSERT in common.h
+#if defined(_WIN32)
+# if defined (_DEBUG)
+#   define CILKRTS_OPTIMIZED 0    // Assumes /MDd is always used with /Od
+# else
+#   define CILKRTS_OPTIMIZED 1
+# endif // defined(_DEBUG)
+#else
+# if defined(__OPTIMIZE__)
+#   define CILKRTS_OPTIMIZED 1
+# else
+#   define CILKRTS_OPTIMIZED 0
+# endif
+#endif
+
+template <typename count_t>
+static inline int grainsize(int req, count_t count)
+{
+    // A positive requested grain size comes from the user. A very high grain
+    // size risks losing parallelism, but the user told us what they want for
+    // grainsize. Who are we to argue?
+    if (req > 0)
+        return req;
+
+    // At present, a negative requested grain size is treated the same way as
+    // a zero grain size, i.e., the runtime computes the actual grainsize
+    // using a heuristic. In the future, the compiler may give us additional
+    // information about the size of the cilk_for body by passing a negative
+    // grain size.
+
+    // Avoid generating a zero grainsize, even for empty loops.
+    if (count < 1)
+        return 1;
+
+    global_state_t* g = cilkg_get_global_state();
+    if (g->under_ptool)
+    {
+        // Grainsize = 1 when running under PIN and the grainsize has
+        // not explicitly been set by the user.
+        return 1;
+    }
+    else
+    {
+        // Divide loop count by 8 times the worker count and round up.
+        const int Px8 = g->P * 8;
+        count_t n = (count + Px8 - 1) / Px8;
+
+        // 2K should be enough to amortize the cost of the cilk_for. Any
+        // larger grainsize risks losing parallelism.
+        if (n > 2048)
+            return 2048;
+        return (int) n;     // n <= 2048, so no loss of precision on cast to int
+    }
+}
+
+/*
+ * call_cilk_for_loop_body
+ *
+ * Centralizes the code to call the loop body. The compiler should be
+ * inlining this code.
+ *
+ * low   - Low loop index we're considering in this portion of the algorithm
+ * high  - High loop index we're considering in this portion of the algorithm
+ * body  - lambda function for the cilk_for loop body
+ * data  - data used by the lambda function
+ * w     - __cilkrts_worker we're currently executing on
+ * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
+ *                      the cilk_for loop to flatten out the internal nodes
+ */
+template <typename count_t, typename F>
+inline static
+void call_cilk_for_loop_body(count_t low, count_t high,
+                             F body, void *data,
+                             __cilkrts_worker *w,
+                             __cilkrts_pedigree *loop_root_pedigree)
+{
+    // Cilkscreen should not report this call in a stack trace
+    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
+
+    // The worker is only valid until the first spawn. Fetch the
+    // __cilkrts_stack_frame out of the worker, since it will be stable across
+    // steals. The sf pointer actually points to the *parent's*
+    // __cilkrts_stack_frame, since this function is a non-spawning function
+    // and therefore has no cilk stack frame of its own.
+    __cilkrts_stack_frame *sf = w->current_stack_frame;
+
+    // Save the pedigree node pointed to by the worker. We'll need to restore
+    // it when we exit, since the spawn helpers in the cilk_for call tree
+    // will assume that it's valid.
+    const __cilkrts_pedigree *saved_next_pedigree_node = w->pedigree.parent;
+
+    // Add the leaf pedigree node to the chain. The parent is the root node,
+    // to flatten the tree regardless of the DAG branches in the cilk_for
+    // divide-and-conquer recursion.
+    //
+    // The rank is initialized to the low index. The user is
+    // expected to call __cilkrts_bump_loop_rank at the end of the cilk_for
+    // loop body.
+    __cilkrts_pedigree loop_leaf_pedigree;
+
+    loop_leaf_pedigree.rank = (uint64_t)low;
+    loop_leaf_pedigree.parent = loop_root_pedigree;
+
+    // The worker's pedigree always starts with a rank of 0
+    w->pedigree.rank = 0;
+    w->pedigree.parent = &loop_leaf_pedigree;
+
+    // Call the compiler-generated cilk_for loop body lambda function
+    body(data, low, high);
+
+    // The loop body may have included spawns, so we must refetch the worker
+    // from the __cilkrts_stack_frame, which is stable regardless of which
+    // worker we're executing on.
+    w = sf->worker;
+
+    // Restore the pedigree chain. It must be valid because the spawn helpers
+    // generated by the cilk_for implementation will access it.
+    w->pedigree.parent = saved_next_pedigree_node;
+}
+
+/* capture_spawn_arg_stack_frame
+ *
+ * Efficiently get the address of the caller's __cilkrts_stack_frame. The
+ * preconditions are that 'w' is the worker at the time of the call and
+ * 'w->current_stack_frame' points to the __cilkrts_stack_frame within the
+ * spawn helper.
+ * This function should be called only within the argument list of a function
+ * that is being spawned, because that is the only situation in which these
+ * preconditions hold. This function returns the worker (unchanged) after the
+ * captured stack frame pointer has been stored in the 'sf' argument.
+ *
+ * The purpose of this function is to get the caller's stack frame in a
+ * context where the caller's worker is known but its stack frame is not
+ * necessarily initialized. The "shrink wrap" optimization delays
+ * initializing the contents of a spawning function's '__cilkrts_stack_frame'
+ * as well as the 'current_stack_frame' pointer within the worker. By calling
+ * this function within a spawning function's argument list, we can ensure
+ * that these initializations have occurred but that a detach (which would
+ * invalidate the worker pointer in the caller) has not yet occurred. Once the
+ * '__cilkrts_stack_frame' has been retrieved in this way, it is stable for the
+ * remainder of the caller's execution, and becomes an efficient way to get
+ * the worker (much more efficient than calling '__cilkrts_get_tls_worker()'),
+ * even after a spawn or sync.
+ */
+inline __cilkrts_worker*
+capture_spawn_arg_stack_frame(__cilkrts_stack_frame* &sf, __cilkrts_worker* w)
+{
+    // Get current stack frame
+    sf = w->current_stack_frame;
+#ifdef __INTEL_COMPILER
+# if __INTEL_COMPILER <= 1300 && __INTEL_COMPILER_BUILD_DATE < 20130101
+    // In older compilers 'w->current_stack_frame' points to the
+    // spawn-helper's stack frame. In newer compilers, however, it points
+    // directly to the spawning function's stack frame. (This change was made
+    // to avoid having the spawn helper in the frame list when evaluating
+    // function arguments, thus avoiding corruption when those arguments
+    // themselves contain cilk_spawns.)
+
+    // w->current_stack_frame is the spawn helper's stack frame.
+    // w->current_stack_frame->call_parent is the caller's stack frame.
+    sf = sf->call_parent;
+# endif
+#endif
+    return w;
+}
+
+/*
+ * cilk_for_recursive
+ *
+ * Templatized function implementing the recursive divide-and-conquer
+ * algorithm by which a cilk_for is executed.
+ *
+ * low   - Low loop index we're considering in this portion of the algorithm
+ * high  - High loop index we're considering in this portion of the algorithm
+ * body  - lambda function for the cilk_for loop body
+ * data  - data used by the lambda function
+ * grain - grain size (0 if it should be computed)
+ * w     - __cilkrts_worker we're currently executing on
+ * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
+ *                      the cilk_for loop to flatten out the internal nodes
+ */
+template <typename count_t, typename F>
+static
+void cilk_for_recursive(count_t low, count_t high,
+                        F body, void *data, int grain,
+                        __cilkrts_worker *w,
+                        __cilkrts_pedigree *loop_root_pedigree)
+{
+tail_recurse:
+    // Cilkscreen should not report this call in a stack trace.
+    // This needs to be done every time the worker resumes.
+    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
+
+    count_t count = high - low;
+    // Invariant: count > 0, grain >= 1
+    if (count > grain)
+    {
+        // Invariant: count >= 2
+        count_t mid = low + count / 2;
+        // The worker is valid only until the first spawn and is expensive to
+        // retrieve (using '__cilkrts_get_tls_worker') after the spawn. The
+        // '__cilkrts_stack_frame' is more stable, but isn't initialized until
+        // the first spawn.
+        // Thus, we want to grab the address of the '__cilkrts_stack_frame'
+        // after it is initialized but before the spawn detaches. The only
+        // place we can do that is within the argument list of the spawned
+        // function, hence the call to capture_spawn_arg_stack_frame().
+        __cilkrts_stack_frame *sf;
+        _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain,
+                                       capture_spawn_arg_stack_frame(sf, w),
+                                       loop_root_pedigree);
+        w = sf->worker;
+        low = mid;
+
+        goto tail_recurse;
+    }
+
+    // Call the cilk_for loop body lambda function passed in by the compiler to
+    // execute one grain
+    call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree);
+}
+
+static void noop() { }
+
+/*
+ * cilk_for_root
+ *
+ * Templatized function to implement the top level of a cilk_for loop.
+ *
+ * body  - lambda function for the cilk_for loop body
+ * data  - data used by the lambda function
+ * count - trip count for loop
+ * grain - grain size (0 if it should be computed)
+ */
+template <typename count_t, typename F>
+static void cilk_for_root(F body, void *data, count_t count, int grain)
+{
+    // Cilkscreen should not report this call in a stack trace
+    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
+
+    // Pedigree computation:
+    //
+    // If the last pedigree node on entry to the _Cilk_for has value X,
+    // then at the start of each iteration of the loop body, the value of
+    // the last pedigree node should be 0, the value of the second-to-last
+    // node should equal the loop counter, and the value of the
+    // third-to-last node should be X. On return from the _Cilk_for, the
+    // value of the last pedigree should be incremented to X+2. The
+    // pedigree within the loop is thus flattened, such that the depth of
+    // recursion does not affect the results either inside or outside of
+    // the loop. Note that the pedigree after the loop exits is the same
+    // as if a single spawn and sync were executed within this function.
+
+    // TBD: Since the shrink-wrap optimization was turned on in the compiler,
+    // it is not possible to get the current stack frame without actually
+    // forcing a call to bind-thread. This spurious spawn is a temporary
+    // stopgap until the correct intrinsics are added to give us total control
+    // over frame initialization.
+    _Cilk_spawn noop();
+
+    // Fetch the current worker. From that we can get the current stack frame,
+    // which will be constant even if we're stolen.
+    __cilkrts_worker *w = __cilkrts_get_tls_worker();
+    __cilkrts_stack_frame *sf = w->current_stack_frame;
+
+    // Decrement the rank by one to undo the pedigree change from the
+    // _Cilk_spawn.
+    --w->pedigree.rank;
+
+    // Save the current worker pedigree into loop_root_pedigree, which will be
+    // the root node for our flattened pedigree.
+    __cilkrts_pedigree loop_root_pedigree = w->pedigree;
+
+    // Don't splice the loop_root node in yet. It will be done when we
+    // call the loop body lambda function.
+//    w->pedigree.rank = 0;
+//    w->pedigree.next = &loop_root_pedigree;
+
+    /* Spawn is necessary at top level to force the runtime to start up.
+     * The runtime must be started in order to call the grainsize() function.
+     */
+    int gs = grainsize(grain, count);
+    cilk_for_recursive((count_t) 0, count, body, data, gs, w,
+                       &loop_root_pedigree);
+
+    // Need to refetch the worker after calling a spawning function.
+    w = sf->worker;
+
+    // Restore the pedigree in the worker.
+    w->pedigree = loop_root_pedigree;
+
+    // Bump the worker pedigree.
+    ++w->pedigree.rank;
+
+    // Implicit sync will increment the pedigree leaf rank again, for a total
+    // of two increments. If the noop spawn above is removed, then we'll need
+    // to re-enable the following code:
+//    // If this is an optimized build, then the compiler will have optimized
+//    // out the increment of the worker's pedigree in the implied sync. We
+//    // need to add one to make the pedigree_loop test work correctly.
+// #if CILKRTS_OPTIMIZED
+//    ++sf->worker->pedigree.rank;
+// #endif
+}
+
+// Use extern "C" to suppress name mangling of __cilkrts_cilk_for_32 and
+// __cilkrts_cilk_for_64.
+extern "C" {
+
+/*
+ * __cilkrts_cilk_for_32
+ *
+ * Implementation of cilk_for for 32-bit trip counts (regardless of processor
+ * word size). Assumes that the range is 0 - count.
+ *
+ * body  - lambda function for the cilk_for loop body
+ * data  - data used by the lambda function
+ * count - trip count for loop
+ * grain - grain size (0 if it should be computed)
+ */
+
+CILK_ABI_THROWS_VOID __cilkrts_cilk_for_32(__cilk_abi_f32_t body, void *data,
+                                           cilk32_t count, int grain)
+{
+    // Cilkscreen should not report this call in a stack trace
+    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
+
+    // Check for an empty range here as an optimization - don't need to do any
+    // __cilkrts_stack_frame initialization
+    if (count > 0)
+        cilk_for_root(body, data, count, grain);
+}
+
+/*
+ * __cilkrts_cilk_for_64
+ *
+ * Implementation of cilk_for for 64-bit trip counts (regardless of processor
+ * word size). Assumes that the range is 0 - count.
+ *
+ * body  - lambda function for the cilk_for loop body
+ * data  - data used by the lambda function
+ * count - trip count for loop
+ * grain - grain size (0 if it should be computed)
+ */
+CILK_ABI_THROWS_VOID __cilkrts_cilk_for_64(__cilk_abi_f64_t body, void *data,
+                                           cilk64_t count, int grain)
+{
+    // Check for an empty range here as an optimization - don't need to do any
+    // __cilkrts_stack_frame initialization
+    if (count > 0)
+        cilk_for_root(body, data, count, grain);
+}
+
+} // end extern "C"
+
+/* End cilk-abi-cilk-for.cpp */
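
The grainsize() heuristic above is easiest to check with concrete numbers. The sketch below is a minimal, standalone model of that computation: model_grainsize and its P parameter are illustrative names, the worker count is passed in rather than read from cilkg_get_global_state(), and the under_ptool special case is omitted.

#include <iostream>

// Standalone model of the grainsize() heuristic (hypothetical name). The real
// routine reads the worker count from the runtime's global state; here P is a
// plain parameter so the arithmetic can be tried in isolation.
template <typename count_t>
int model_grainsize(int req, count_t count, int P)
{
    if (req > 0)        // an explicit user request is passed through unchanged
        return req;
    if (count < 1)      // never return a zero grain, even for empty loops
        return 1;
    const count_t Px8 = static_cast<count_t>(P) * 8;
    count_t n = (count + Px8 - 1) / Px8;           // ceil(count / (8 * P))
    return n > 2048 ? 2048 : static_cast<int>(n);  // cap at 2K iterations
}

int main()
{
    std::cout << model_grainsize(0, 1000000u, 8) << "\n";   // 15625, capped to 2048
    std::cout << model_grainsize(0, 100u, 8) << "\n";       // ceil(100 / 64) = 2
    std::cout << model_grainsize(500, 1000000u, 8) << "\n"; // user asked for 500
}

With 8 workers, a million-iteration loop hits the 2048-iteration cap, while a 100-iteration loop gets a grain of 2; either way roughly 8 * P chunks stay available for stealing.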
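cilk_for_recursive() is easiest to read through its serial elision: drop the _Cilk_spawn and the worker/pedigree plumbing, and what remains is a plain recursive splitter. The sketch below is that elision under hypothetical names; each body(low, high) call stands in for one call_cilk_for_loop_body() invocation on a chunk no larger than the grain.

#include <iostream>

// Serial elision of cilk_for_recursive() (hypothetical name): the _Cilk_spawn
// becomes an ordinary call and the 'goto tail_recurse' becomes a while loop.
template <typename count_t, typename F>
void serial_cilk_for_recursive(count_t low, count_t high, F body, int grain)
{
    while (high - low > static_cast<count_t>(grain))
    {
        count_t mid = low + (high - low) / 2;
        serial_cilk_for_recursive(low, mid, body, grain);  // the "spawned" half
        low = mid;                                         // keep going on the rest
    }
    body(low, high);   // one grain, handed to the loop-body lambda
}

int main()
{
    // Splitting 10 iterations with a grain of 3 produces chunks of at most 3:
    // [0, 2) [2, 5) [5, 7) [7, 10)
    serial_cilk_for_recursive<unsigned>(0, 10,
        [](unsigned lo, unsigned hi) { std::cout << "[" << lo << ", " << hi << ")\n"; },
        3);
}

Because the real code always spawns the lower half and loops on the upper half, the chunks appear in left-to-right order here; under the runtime the spawned halves may run on different workers in any order.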
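The pedigree comment in cilk_for_root() promises that, at the start of iteration i, user code sees the chain 0, i, X no matter how deep the divide-and-conquer recursion went. The toy model below, with hypothetical names, rebuilds just that chain shape from the pieces the runtime uses: a copied root node holding X, a leaf node whose rank tracks the iteration, and the worker's own node reset to rank 0.

#include <cstdint>
#include <iostream>

// Toy model of the flattened pedigree (hypothetical names). A node is
// (rank, parent); the chain visible at the start of iteration i should read
// 0, i, X regardless of the recursion depth that produced it.
struct PedigreeNode {
    uint64_t rank;
    const PedigreeNode* parent;
};

static void print_chain(const PedigreeNode* p)
{
    for (; p; p = p->parent)
        std::cout << p->rank << " ";
    std::cout << "\n";
}

int main()
{
    const uint64_t X = 7;                       // rank of the last node before the loop
    PedigreeNode loop_root = {X, nullptr};      // copy saved by cilk_for_root()

    for (uint64_t i = 0; i < 3; ++i) {
        PedigreeNode leaf   = {i, &loop_root};  // leaf rank tracks the iteration
        PedigreeNode worker = {0, &leaf};       // the worker's rank restarts at 0
        print_chain(&worker);                   // prints "0 i 7"
    }
}

In the real runtime the leaf rank starts at the low index of the grain and is advanced by __cilkrts_bump_loop_rank at the end of each loop-body iteration; the model simply sets it to i directly.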
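Finally, the two extern "C" entry points define the contract between compiler and runtime: the loop body is outlined into a function invoked as body(data, low, high), and the runtime decides how to partition [0, count). The sketch below illustrates that calling convention with a hand-written body function; the lowering shown is only an approximation of what a Cilk compiler emits, cilk32_t and cilk_for_body32_t are local stand-ins for the types in internal/abi.h, and serial_cilk_for_32 is a serial substitute for __cilkrts_cilk_for_32 so the example runs without the Cilk runtime.

#include <cstdint>
#include <iostream>

typedef uint32_t cilk32_t;
typedef void (*cilk_for_body32_t)(void* data, cilk32_t low, cilk32_t high);

// Serial stand-in for __cilkrts_cilk_for_32: hands the whole range to the
// outlined body. The real entry point splits [0, count) recursively.
static void serial_cilk_for_32(cilk_for_body32_t body, void* data,
                               cilk32_t count, int /*grain*/)
{
    if (count > 0)
        body(data, 0, count);
}

// Hand-written approximation of what a compiler might emit for
//     cilk_for (int i = 0; i < n; ++i) squares[i] = i * i;
struct LoopContext { int* squares; };

static void loop_body(void* data, cilk32_t low, cilk32_t high)
{
    LoopContext* ctx = static_cast<LoopContext*>(data);
    for (cilk32_t i = low; i < high; ++i)
        ctx->squares[i] = static_cast<int>(i * i);
}

int main()
{
    int out[8] = {0};
    LoopContext ctx = {out};
    serial_cilk_for_32(loop_body, &ctx, 8, 0);  // grain 0: let the runtime choose
    for (int v : out)
        std::cout << v << " ";                  // 0 1 4 9 16 25 36 49
    std::cout << "\n";
}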