diff options
author | Nathan Sidwell <nathan@codesourcery.com> | 2015-11-18 13:49:17 +0000 |
---|---|---|
committer | Nathan Sidwell <nathan@gcc.gnu.org> | 2015-11-18 13:49:17 +0000 |
commit | 33f47f427920586a6f21dd8f1b1adc582b9fb7af (patch) | |
tree | 91925ad53ac42a3a444905c3ffb894e6ec7c3a18 /gcc | |
parent | d085c468179245fdd31c1014d3029ddd9e116e01 (diff) | |
download | gcc-33f47f427920586a6f21dd8f1b1adc582b9fb7af.zip gcc-33f47f427920586a6f21dd8f1b1adc582b9fb7af.tar.gz gcc-33f47f427920586a6f21dd8f1b1adc582b9fb7af.tar.bz2 |
nvptx.c (global_lock_var): New.
gcc/
* config/nvptx/nvptx.c (global_lock_var): New.
(nvptx_global_lock_addr): New.
(nvptx_lockless_update): Recomment and adjust for clarity.
(nvptx_lockfull_update): New.
(nvptx_reduction_update): New.
(nvptx_goacc_reduction_fini): Call it.
libgcc/
* config/nvptx/reduction.c: New.
* config/nvptx/t-nvptx (LIB2ADD): Add it.
libgomp/
* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Add
worker & gang cases.
* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise.
From-SVN: r230545
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog | 9 | ||||
-rw-r--r-- | gcc/config/nvptx/nvptx.c | 240 |
2 files changed, 210 insertions, 39 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index abf01d3..d569816 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2015-11-18 Nathan Sidwell <nathan@codesourcery.com> + + * config/nvptx/nvptx.c (global_lock_var): New. + (nvptx_global_lock_addr): New. + (nvptx_lockless_update): Recomment and adjust for clarity. + (nvptx_lockfull_update): New. + (nvptx_reduction_update): New. + (nvptx_goacc_reduction_fini): Call it. + 2015-11-18 Bernd Schmidt <bschmidt@redhat.com> * regrename.h (struct du_head): Add target_data_1 and target_data_2 diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 26c2e96..4436ac4 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -114,6 +114,9 @@ static unsigned worker_red_align; #define worker_red_name "__worker_red" static GTY(()) rtx worker_red_sym; +/* Global lock variable, needed for 128bit worker & gang reductions. */ +static GTY(()) tree global_lock_var; + /* Allocate a new, cleared machine_function structure. */ static struct machine_function * @@ -3681,8 +3684,45 @@ nvptx_generate_vector_shuffle (location_t loc, gimplify_assign (dest_var, expr, seq); } -/* Insert code to locklessly update *PTR with *PTR OP VAR just before - GSI. */ +/* Lazily generate the global lock var decl and return its address. */ + +static tree +nvptx_global_lock_addr () +{ + tree v = global_lock_var; + + if (!v) + { + tree name = get_identifier ("__reduction_lock"); + tree type = build_qualified_type (unsigned_type_node, + TYPE_QUAL_VOLATILE); + v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type); + global_lock_var = v; + DECL_ARTIFICIAL (v) = 1; + DECL_EXTERNAL (v) = 1; + TREE_STATIC (v) = 1; + TREE_PUBLIC (v) = 1; + TREE_USED (v) = 1; + mark_addressable (v); + mark_decl_referenced (v); + } + + return build_fold_addr_expr (v); +} + +/* Insert code to locklessly update *PTR with *PTR OP VAR just before + GSI. We use a lockless scheme for nearly all case, which looks + like: + actual = initval(OP); + do { + guess = actual; + write = guess OP myval; + actual = cmp&swap (ptr, guess, write) + } while (actual bit-different-to guess); + return write; + + This relies on a cmp&swap instruction, which is available for 32- + and 64-bit types. Larger types must use a locking scheme. */ static tree nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, @@ -3690,46 +3730,30 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, { unsigned fn = NVPTX_BUILTIN_CMP_SWAP; tree_code code = NOP_EXPR; - tree type = unsigned_type_node; - - enum machine_mode mode = TYPE_MODE (TREE_TYPE (var)); + tree arg_type = unsigned_type_node; + tree var_type = TREE_TYPE (var); - if (!INTEGRAL_MODE_P (mode)) + if (TREE_CODE (var_type) == COMPLEX_TYPE + || TREE_CODE (var_type) == REAL_TYPE) code = VIEW_CONVERT_EXPR; - if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode)) + + if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node)) { + arg_type = long_long_unsigned_type_node; fn = NVPTX_BUILTIN_CMP_SWAPLL; - type = long_long_unsigned_type_node; } + tree swap_fn = nvptx_builtin_decl (fn, true); + gimple_seq init_seq = NULL; - tree init_var = make_ssa_name (type); - tree init_expr = omp_reduction_init_op (loc, op, TREE_TYPE (var)); - init_expr = fold_build1 (code, type, init_expr); + tree init_var = make_ssa_name (arg_type); + tree init_expr = omp_reduction_init_op (loc, op, var_type); + init_expr = fold_build1 (code, arg_type, init_expr); gimplify_assign (init_var, init_expr, &init_seq); gimple *init_end = gimple_seq_last (init_seq); gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT); - gimple_seq loop_seq = NULL; - tree expect_var = make_ssa_name (type); - tree actual_var = make_ssa_name (type); - tree write_var = make_ssa_name (type); - - tree write_expr = fold_build1 (code, TREE_TYPE (var), expect_var); - write_expr = fold_build2 (op, TREE_TYPE (var), write_expr, var); - write_expr = fold_build1 (code, type, write_expr); - gimplify_assign (write_var, write_expr, &loop_seq); - - tree swap_expr = nvptx_builtin_decl (fn, true); - swap_expr = build_call_expr_loc (loc, swap_expr, 3, - ptr, expect_var, write_var); - gimplify_assign (actual_var, swap_expr, &loop_seq); - - gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var, - NULL_TREE, NULL_TREE); - gimple_seq_add_stmt (&loop_seq, cond); - /* Split the block just after the init stmts. */ basic_block pre_bb = gsi_bb (*gsi); edge pre_edge = split_block (pre_bb, init_end); @@ -3738,12 +3762,34 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, /* Reset the iterator. */ *gsi = gsi_for_stmt (gsi_stmt (*gsi)); - /* Insert the loop statements. */ - gimple *loop_end = gimple_seq_last (loop_seq); - gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT); + tree expect_var = make_ssa_name (arg_type); + tree actual_var = make_ssa_name (arg_type); + tree write_var = make_ssa_name (arg_type); + + /* Build and insert the reduction calculation. */ + gimple_seq red_seq = NULL; + tree write_expr = fold_build1 (code, var_type, expect_var); + write_expr = fold_build2 (op, var_type, write_expr, var); + write_expr = fold_build1 (code, arg_type, write_expr); + gimplify_assign (write_var, write_expr, &red_seq); + + gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT); + + /* Build & insert the cmp&swap sequence. */ + gimple_seq latch_seq = NULL; + tree swap_expr = build_call_expr_loc (loc, swap_fn, 3, + ptr, expect_var, write_var); + gimplify_assign (actual_var, swap_expr, &latch_seq); + + gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var, + NULL_TREE, NULL_TREE); + gimple_seq_add_stmt (&latch_seq, cond); + + gimple *latch_end = gimple_seq_last (latch_seq); + gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT); - /* Split the block just after the loop stmts. */ - edge post_edge = split_block (loop_bb, loop_end); + /* Split the block just after the latch stmts. */ + edge post_edge = split_block (loop_bb, latch_end); basic_block post_bb = post_edge->dest; loop_bb = post_edge->src; *gsi = gsi_for_stmt (gsi_stmt (*gsi)); @@ -3762,7 +3808,123 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, loop->latch = loop_bb; add_loop (loop, loop_bb->loop_father); - return fold_build1 (code, TREE_TYPE (var), write_var); + return fold_build1 (code, var_type, write_var); +} + +/* Insert code to lockfully update *PTR with *PTR OP VAR just before + GSI. This is necessary for types larger than 64 bits, where there + is no cmp&swap instruction to implement a lockless scheme. We use + a lock variable in global memory. + + while (cmp&swap (&lock_var, 0, 1)) + continue; + T accum = *ptr; + accum = accum OP var; + *ptr = accum; + cmp&swap (&lock_var, 1, 0); + return accum; + + A lock in global memory is necessary to force execution engine + descheduling and avoid resource starvation that can occur if the + lock is in .shared memory. */ + +static tree +nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, + tree ptr, tree var, tree_code op) +{ + tree var_type = TREE_TYPE (var); + tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true); + tree uns_unlocked = build_int_cst (unsigned_type_node, 0); + tree uns_locked = build_int_cst (unsigned_type_node, 1); + + /* Split the block just before the gsi. Insert a gimple nop to make + this easier. */ + gimple *nop = gimple_build_nop (); + gsi_insert_before (gsi, nop, GSI_SAME_STMT); + basic_block entry_bb = gsi_bb (*gsi); + edge entry_edge = split_block (entry_bb, nop); + basic_block lock_bb = entry_edge->dest; + /* Reset the iterator. */ + *gsi = gsi_for_stmt (gsi_stmt (*gsi)); + + /* Build and insert the locking sequence. */ + gimple_seq lock_seq = NULL; + tree lock_var = make_ssa_name (unsigned_type_node); + tree lock_expr = nvptx_global_lock_addr (); + lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr, + uns_unlocked, uns_locked); + gimplify_assign (lock_var, lock_expr, &lock_seq); + gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked, + NULL_TREE, NULL_TREE); + gimple_seq_add_stmt (&lock_seq, cond); + gimple *lock_end = gimple_seq_last (lock_seq); + gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT); + + /* Split the block just after the lock sequence. */ + edge locked_edge = split_block (lock_bb, lock_end); + basic_block update_bb = locked_edge->dest; + lock_bb = locked_edge->src; + *gsi = gsi_for_stmt (gsi_stmt (*gsi)); + + /* Create the lock loop ... */ + locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU; + make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE); + set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb); + set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb); + + /* ... and the loop structure. */ + loop *lock_loop = alloc_loop (); + lock_loop->header = lock_bb; + lock_loop->latch = lock_bb; + lock_loop->nb_iterations_estimate = 1; + lock_loop->any_estimate = true; + add_loop (lock_loop, entry_bb->loop_father); + + /* Build and insert the reduction calculation. */ + gimple_seq red_seq = NULL; + tree acc_in = make_ssa_name (var_type); + tree ref_in = build_simple_mem_ref (ptr); + TREE_THIS_VOLATILE (ref_in) = 1; + gimplify_assign (acc_in, ref_in, &red_seq); + + tree acc_out = make_ssa_name (var_type); + tree update_expr = fold_build2 (op, var_type, ref_in, var); + gimplify_assign (acc_out, update_expr, &red_seq); + + tree ref_out = build_simple_mem_ref (ptr); + TREE_THIS_VOLATILE (ref_out) = 1; + gimplify_assign (ref_out, acc_out, &red_seq); + + gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT); + + /* Build & insert the unlock sequence. */ + gimple_seq unlock_seq = NULL; + tree unlock_expr = nvptx_global_lock_addr (); + unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr, + uns_locked, uns_unlocked); + gimplify_and_add (unlock_expr, &unlock_seq); + gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT); + + return acc_out; +} + +/* Emit a sequence to update a reduction accumlator at *PTR with the + value held in VAR using operator OP. Return the updated value. + + TODO: optimize for atomic ops and indepedent complex ops. */ + +static tree +nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi, + tree ptr, tree var, tree_code op) +{ + tree type = TREE_TYPE (var); + tree size = TYPE_SIZE (type); + + if (size == TYPE_SIZE (unsigned_type_node) + || size == TYPE_SIZE (long_long_unsigned_type_node)) + return nvptx_lockless_update (loc, gsi, ptr, var, op); + else + return nvptx_lockfull_update (loc, gsi, ptr, var, op); } /* NVPTX implementation of GOACC_REDUCTION_SETUP. */ @@ -3944,11 +4106,11 @@ nvptx_goacc_reduction_fini (gcall *call) if (accum) { - /* Locklessly update the accumulator. */ + /* UPDATE the accumulator. */ gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); seq = NULL; - r = nvptx_lockless_update (gimple_location (call), &gsi, - accum, var, op); + r = nvptx_reduction_update (gimple_location (call), &gsi, + accum, var, op); } } |