diff options
author | Alexander Monakov <amonakov@ispras.ru> | 2017-03-28 20:24:57 +0300 |
---|---|---|
committer | Alexander Monakov <amonakov@gcc.gnu.org> | 2017-03-28 20:24:57 +0300 |
commit | 0c6b03b5158f53a3c7042cf8625aa5e6bc74f52b (patch) | |
tree | 6e27cae40470f82adccad608f14526f11d8fbc1a /gcc/config | |
parent | cf474530613eaaa4d28534a5a53ef61fcc71180d (diff) | |
download | gcc-0c6b03b5158f53a3c7042cf8625aa5e6bc74f52b.zip gcc-0c6b03b5158f53a3c7042cf8625aa5e6bc74f52b.tar.gz gcc-0c6b03b5158f53a3c7042cf8625aa5e6bc74f52b.tar.bz2 |
OpenMP/PTX privatization in SIMD regions
* config/nvptx/nvptx-protos.h (nvptx_output_simt_enter): Declare.
(nvptx_output_simt_exit): Declare.
* config/nvptx/nvptx.c (nvptx_init_unisimt_predicate): Use
cfun->machine->unisimt_location. Handle NULL unisimt_predicate.
(init_softstack_frame): Move initialization of crtl->is_leaf to...
(nvptx_declare_function_name): ...here. Emit declaration of local
memory space buffer for omp_simt_enter insn.
(nvptx_output_unisimt_switch): New.
(nvptx_output_softstack_switch): New.
(nvptx_output_simt_enter): New.
(nvptx_output_simt_exit): New.
* config/nvptx/nvptx.h (struct machine_function): New fields
has_simtreg, unisimt_location, simt_stack_size, simt_stack_align.
* config/nvptx/nvptx.md (UNSPECV_SIMT_ENTER): New unspec.
(UNSPECV_SIMT_EXIT): Ditto.
(omp_simt_enter_insn): New insn.
(omp_simt_enter): New expansion.
(omp_simt_exit): New insn.
* config/nvptx/nvptx.opt (msoft-stack-reserve-local): New option.
* internal-fn.c (expand_GOMP_SIMT_ENTER): New.
(expand_GOMP_SIMT_ENTER_ALLOC): New.
(expand_GOMP_SIMT_EXIT): New.
* internal-fn.def (GOMP_SIMT_ENTER): New internal function.
(GOMP_SIMT_ENTER_ALLOC): Ditto.
(GOMP_SIMT_EXIT): Ditto.
* target-insns.def (omp_simt_enter): New insn.
(omp_simt_exit): Ditto.
* omp-low.c (struct omplow_simd_context): New fields simt_eargs,
simt_dlist.
(lower_rec_simd_input_clauses): Implement SIMT privatization.
(lower_rec_input_clauses): Likewise.
(lower_lastprivate_clauses): Handle SIMT privatization.
* omp-offload.c: Include langhooks.h, tree-nested.h, stor-layout.h.
(ompdevlow_adjust_simt_enter): New.
(find_simtpriv_var_op): New.
(execute_omp_device_lower): Handle IFN_GOMP_SIMT_ENTER,
IFN_GOMP_SIMT_ENTER_ALLOC, IFN_GOMP_SIMT_EXIT.
* tree-inline.h (struct copy_body_data): New field dst_simt_vars.
* tree-inline.c (expand_call_inline): Handle SIMT privatization.
(copy_decl_for_dup_finish): Ditto.
* tree-ssa.c (execute_update_addresses_taken): Handle GOMP_SIMT_ENTER.
From-SVN: r246550
Diffstat (limited to 'gcc/config')
-rw-r--r-- | gcc/config/nvptx/nvptx-protos.h | 2 |
-rw-r--r-- | gcc/config/nvptx/nvptx.c | 163 |
-rw-r--r-- | gcc/config/nvptx/nvptx.h | 6 |
-rw-r--r-- | gcc/config/nvptx/nvptx.md | 39 |
-rw-r--r-- | gcc/config/nvptx/nvptx.opt | 4 |
5 files changed, 196 insertions, 18 deletions
diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h index aaea3ba..16b316f 100644 --- a/gcc/config/nvptx/nvptx-protos.h +++ b/gcc/config/nvptx/nvptx-protos.h @@ -53,5 +53,7 @@ extern const char *nvptx_output_mov_insn (rtx, rtx); extern const char *nvptx_output_call_insn (rtx_insn *, rtx, rtx); extern const char *nvptx_output_return (void); extern const char *nvptx_output_set_softstack (unsigned); +extern const char *nvptx_output_simt_enter (rtx, rtx, rtx); +extern const char *nvptx_output_simt_exit (rtx); #endif #endif diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 647855c..83f4610 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -1048,11 +1048,6 @@ init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size) fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n", bits, reg_stack, reg_frame, size); - /* Usually 'crtl->is_leaf' is computed during register allocator - initialization (which is not done on NVPTX) or for pressure-sensitive - optimizations. Initialize it here, except if already set. 
*/ - if (!crtl->is_leaf) - crtl->is_leaf = leaf_function_p (); if (!crtl->is_leaf) fprintf (file, "\t\tst.shared.u%d [%s], %s;\n", bits, reg_sspslot, reg_stack); @@ -1080,24 +1075,29 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name) static void nvptx_init_unisimt_predicate (FILE *file) { + cfun->machine->unisimt_location = gen_reg_rtx (Pmode); + int loc = REGNO (cfun->machine->unisimt_location); int bits = POINTER_SIZE; - int master = REGNO (cfun->machine->unisimt_master); - int pred = REGNO (cfun->machine->unisimt_predicate); + fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc); fprintf (file, "\t{\n"); fprintf (file, "\t\t.reg.u32 %%ustmp0;\n"); fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits); - fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits); fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n"); fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n", bits == 64 ? ".wide" : ".lo"); - fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits); - fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits); - fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master); - fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.x;\n"); - /* Compute 'master lane index' as 'tid.x & __nvptx_uni[tid.y]'. */ - fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master); - /* Compute predicate as 'tid.x == master'. */ - fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master); + fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc); + fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc); + if (cfun->machine->unisimt_predicate) + { + int master = REGNO (cfun->machine->unisimt_master); + int pred = REGNO (cfun->machine->unisimt_predicate); + fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc); + fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n"); + /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. 
*/ + fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master); + /* Compute predicate as 'tid.x == master'. */ + fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master); + } fprintf (file, "\t}\n"); need_unisimt_decl = true; } @@ -1224,6 +1224,12 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) fprintf (file, "%s", s.str().c_str()); + /* Usually 'crtl->is_leaf' is computed during register allocator + initialization (which is not done on NVPTX) or for pressure-sensitive + optimizations. Initialize it here, except if already set. */ + if (!crtl->is_leaf) + crtl->is_leaf = leaf_function_p (); + HOST_WIDE_INT sz = get_frame_size (); bool need_frameptr = sz || cfun->machine->has_chain; int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT; @@ -1240,9 +1246,28 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) init_frame (file, FRAME_POINTER_REGNUM, alignment, ROUND_UP (sz, GET_MODE_SIZE (DImode))); } - else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca) + else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca + || (cfun->machine->has_simtreg && !crtl->is_leaf)) init_softstack_frame (file, alignment, sz); + if (cfun->machine->has_simtreg) + { + unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size; + unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align; + align = MAX (align, GET_MODE_SIZE (DImode)); + if (!crtl->is_leaf || cfun->calls_alloca) + simtsz = HOST_WIDE_INT_M1U; + if (simtsz == HOST_WIDE_INT_M1U) + simtsz = nvptx_softstack_size; + if (cfun->machine->has_softstack) + simtsz += POINTER_SIZE / 8; + simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode)); + if (align > GET_MODE_SIZE (DImode)) + simtsz += align - GET_MODE_SIZE (DImode); + if (simtsz) + fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar[" + HOST_WIDE_INT_PRINT_DEC "];\n", simtsz); + } /* Declare the pseudos we have as ptx registers. 
*/ int maxregs = max_reg_num (); for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++) @@ -1267,10 +1292,112 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) if (cfun->machine->axis_predicate[1]) nvptx_init_axis_predicate (file, REGNO (cfun->machine->axis_predicate[1]), "x"); - if (cfun->machine->unisimt_predicate) + if (cfun->machine->unisimt_predicate + || (cfun->machine->has_simtreg && !crtl->is_leaf)) nvptx_init_unisimt_predicate (file); } +/* Output code for switching uniform-simt state. ENTERING indicates whether + we are entering or leaving non-uniform execution region. */ + +static void +nvptx_output_unisimt_switch (FILE *file, bool entering) +{ + if (crtl->is_leaf && !cfun->machine->unisimt_predicate) + return; + fprintf (file, "\t{\n"); + fprintf (file, "\t\t.reg.u32 %%ustmp2;\n"); + fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0); + if (!crtl->is_leaf) + { + int loc = REGNO (cfun->machine->unisimt_location); + fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc); + } + if (cfun->machine->unisimt_predicate) + { + int master = REGNO (cfun->machine->unisimt_master); + int pred = REGNO (cfun->machine->unisimt_predicate); + fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n"); + fprintf (file, "\t\tmov.u32 %%r%d, %s;\n", + master, entering ? "%ustmp2" : "0"); + fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master); + } + fprintf (file, "\t}\n"); +} + +/* Output code for allocating per-lane storage and switching soft-stack pointer. + ENTERING indicates whether we are entering or leaving non-uniform execution. + PTR is the register pointing to allocated storage, it is assigned to on + entering and used to restore state on leaving. SIZE and ALIGN are used only + on entering. 
*/ + +static void +nvptx_output_softstack_switch (FILE *file, bool entering, + rtx ptr, rtx size, rtx align) +{ + gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr)); + if (crtl->is_leaf && !cfun->machine->simt_stack_size) + return; + int bits = POINTER_SIZE, regno = REGNO (ptr); + fprintf (file, "\t{\n"); + if (entering) + { + fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + " + HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno, + cfun->machine->simt_stack_size); + fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno); + if (CONST_INT_P (size)) + fprintf (file, HOST_WIDE_INT_PRINT_DEC, + ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode))); + else + output_reg (file, REGNO (size), VOIDmode); + fputs (";\n", file); + if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode)) + fprintf (file, "\t\tand.u%d %%r%d, %%r%d, -%d;\n", + bits, regno, regno, UINTVAL (align)); + } + if (cfun->machine->has_softstack) + { + const char *reg_stack = reg_names[STACK_POINTER_REGNUM]; + if (entering) + { + fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n", + bits, regno, bits / 8, reg_stack); + fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n", + bits, reg_stack, regno, bits / 8); + } + else + { + fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n", + bits, reg_stack, regno, bits / 8); + } + nvptx_output_set_softstack (REGNO (stack_pointer_rtx)); + } + fprintf (file, "\t}\n"); +} + +/* Output code to enter non-uniform execution region. DEST is a register + to hold a per-lane allocation given by SIZE and ALIGN. */ + +const char * +nvptx_output_simt_enter (rtx dest, rtx size, rtx align) +{ + nvptx_output_unisimt_switch (asm_out_file, true); + nvptx_output_softstack_switch (asm_out_file, true, dest, size, align); + return ""; +} + +/* Output code to leave non-uniform execution region. SRC is the register + holding per-lane storage previously allocated by omp_simt_enter insn. 
*/ + +const char * +nvptx_output_simt_exit (rtx src) +{ + nvptx_output_unisimt_switch (asm_out_file, false); + nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX); + return ""; +} + /* Output instruction that sets soft stack pointer in shared memory to the value in register given by SRC_REGNO. */ diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index 8338d4e..0a000a7 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -213,12 +213,18 @@ struct GTY(()) machine_function bool has_varadic; /* Current function has a varadic call. */ bool has_chain; /* Current function has outgoing static chain. */ bool has_softstack; /* Current function has a soft stack frame. */ + bool has_simtreg; /* Current function has an OpenMP SIMD region. */ int num_args; /* Number of args of current call. */ int return_mode; /* Return mode of current fn. (machine_mode not defined yet.) */ rtx axis_predicate[2]; /* Neutering predicates. */ rtx unisimt_master; /* 'Master lane index' for -muniform-simt. */ rtx unisimt_predicate; /* Predicate for -muniform-simt. */ + rtx unisimt_location; /* Mask location for -muniform-simt. */ + /* The following two fields hold the maximum size resp. alignment required + for per-lane storage in OpenMP SIMD regions. 
*/ + unsigned HOST_WIDE_INT simt_stack_size; + unsigned HOST_WIDE_INT simt_stack_align; }; #endif diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 50dd42e..f2ed63b 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -63,6 +63,9 @@ UNSPECV_JOIN UNSPECV_NOUNROLL + + UNSPECV_SIMT_ENTER + UNSPECV_SIMT_EXIT ]) (define_attr "subregs_ok" "false,true" @@ -1184,6 +1187,42 @@ ;; Patterns for OpenMP SIMD-via-SIMT lowering +(define_insn "omp_simt_enter_insn" + [(set (match_operand 0 "nvptx_register_operand" "=R") + (unspec_volatile [(match_operand 1 "nvptx_nonmemory_operand" "Ri") + (match_operand 2 "nvptx_nonmemory_operand" "Ri")] + UNSPECV_SIMT_ENTER))] + "" +{ + return nvptx_output_simt_enter (operands[0], operands[1], operands[2]); +}) + +(define_expand "omp_simt_enter" + [(match_operand 0 "nvptx_register_operand" "=R") + (match_operand 1 "nvptx_nonmemory_operand" "Ri") + (match_operand 2 "const_int_operand" "n")] + "" +{ + if (!CONST_INT_P (operands[1])) + cfun->machine->simt_stack_size = HOST_WIDE_INT_M1U; + else + cfun->machine->simt_stack_size = MAX (UINTVAL (operands[1]), + cfun->machine->simt_stack_size); + cfun->machine->simt_stack_align = MAX (UINTVAL (operands[2]), + cfun->machine->simt_stack_align); + cfun->machine->has_simtreg = true; + emit_insn (gen_omp_simt_enter_insn (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_insn "omp_simt_exit" + [(unspec_volatile [(match_operand 0 "nvptx_register_operand" "R")] + UNSPECV_SIMT_EXIT)] + "" +{ + return nvptx_output_simt_exit (operands[0]); +}) + ;; Implement IFN_GOMP_SIMT_LANE: set operand 0 to lane index (define_insn "omp_simt_lane" [(set (match_operand:SI 0 "nvptx_register_operand" "") diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 80aab5b..901def7 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -37,6 +37,10 @@ msoft-stack Target Report Mask(SOFT_STACK) Use custom stacks instead of local memory for 
automatic storage. +msoft-stack-reserve-local +Target Report Joined RejectNegative UInteger Var(nvptx_softstack_size) Init(128) +Specify size of .local memory used for stack when the exact amount is not known. + muniform-simt Target Report Mask(UNIFORM_SIMT) Generate code that can keep local state uniform across all lanes. |