diff options
Diffstat (limited to 'gcc/config/gcn')
-rw-r--r-- | gcc/config/gcn/gcn-opts.h | 2 | ||||
-rw-r--r-- | gcc/config/gcn/gcn-valu.md | 4 | ||||
-rw-r--r-- | gcc/config/gcn/gcn.cc | 103 | ||||
-rw-r--r-- | gcc/config/gcn/gcn.md | 40 |
4 files changed, 101 insertions, 48 deletions
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h index fe68678..0287400 100644 --- a/gcc/config/gcn/gcn-opts.h +++ b/gcc/config/gcn/gcn-opts.h @@ -92,6 +92,8 @@ enum hsaco_attr_type /* Whether to use the 'globally coherent' (glc) or the 'scope' (sc0) flag for non-scalar memory operations. The string starts on purpose with a space. Note: for scalar memory operations (i.e. 's_...'), 'glc' is still used. + Note: on atomics, glc/sc0 denotes whether the pre-op operation should + be used. CDNA3 also uses 'nt' instead of 'slc' and 'sc1' instead of 'scc'; however, there is no non-scalar user so far. */ #define TARGET_GLC_NAME (TARGET_CDNA3 ? " sc0" : " glc") diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 0994329..a34d2e3 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -3938,6 +3938,7 @@ v_cmpx%E1\t%2, %3 v_cmpx%E1\t%2, %3" [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc") + (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx") (set_attr "length" "4,8,4,8,8,8,4,8") (set_attr "rdna" "*,*,no,no,*,*,yes,yes")]) @@ -3992,6 +3993,7 @@ v_cmpx%E1\t%2, %3 v_cmpx%E1\t%2, %3" [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc") + (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx") (set_attr "length" "4,8,4,8,8,8,4,8") (set_attr "rdna" "*,*,no,no,*,*,yes,yes")]) @@ -4050,6 +4052,7 @@ v_cmpx%E1\t%2, %3 v_cmpx%E1\t%2, %3" [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc") + (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx") (set_attr "length" "4,8,4,8,8,4,8") (set_attr "rdna" "*,*,no,no,*,yes,yes")]) @@ -4073,6 +4076,7 @@ v_cmpx%E1\t%2, %3 v_cmpx%E1\t%2, %3" [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc") + (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx") (set_attr "length" "4,8,4,8,8,4,8") (set_attr "rdna" "*,*,no,no,*,yes,yes")]) diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 8959118..5ffeb23 100644 --- 
a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -54,6 +54,7 @@ #include "gimple.h" #include "cgraph.h" #include "case-cfn-macros.h" +#include "opts.h" /* This file should be included last. */ #include "target-def.h" @@ -183,6 +184,11 @@ gcn_option_override (void) if (flag_sram_ecc == HSACO_ATTR_DEFAULT) flag_sram_ecc = gcn_devices[gcn_arch].sramecc_default; + + /* TODO: This seems to produce tighter loops, but the testsuites expects it + to be set to '2', so I'll leave it default for now. + SET_OPTION_IF_UNSET (&global_options, &global_options_set, + param_vect_partial_vector_usage, 1); */ } /* }}} */ @@ -5789,45 +5795,19 @@ gcn_libc_has_function (enum function_class fn_class, return bsd_libc_has_function (fn_class, type); } -/* }}} */ -/* {{{ md_reorg pass. */ - -/* Identify V_CMPX from the "type" attribute; - note: this will also match 'v_cmp %E1 vcc'. */ +/* Implement TARGET_VECTORIZE_PREFER_GATHER_SCATTER. */ static bool -gcn_cmpx_insn_p (attr_type type) +gcn_prefer_gather_scatter (machine_mode ARG_UNUSED (mode), + int ARG_UNUSED (scale), + unsigned int ARG_UNUSED (group_size)) { - switch (type) - { - case TYPE_VOPC: - return true; - case TYPE_MUBUF: - case TYPE_MTBUF: - case TYPE_FLAT: - case TYPE_VOP3P_MAI: - case TYPE_UNKNOWN: - case TYPE_SOP1: - case TYPE_SOP2: - case TYPE_SOPK: - case TYPE_SOPC: - case TYPE_SOPP: - case TYPE_SMEM: - case TYPE_DS: - case TYPE_VOP2: - case TYPE_VOP1: - case TYPE_VOP3A: - case TYPE_VOP3B: - case TYPE_VOP_SDWA: - case TYPE_VOP_DPP: - case TYPE_MULT: - case TYPE_VMULT: - return false; - } - gcc_unreachable (); - return false; + return true; } +/* }}} */ +/* {{{ md_reorg pass. */ + /* Identify VMEM instructions from their "type" attribute. */ static bool @@ -6356,19 +6336,59 @@ gcn_md_reorg (void) reg_class_contents[(int)VCC_CONDITIONAL_REG]))) nops_rqd = ivccwait - prev_insn->age; + /* NOTE: The following condition for adding wait state exists, but + GCC does not access the special registers using their SGPR#. 
+ Thus, no action is required here. The following wait-state + condition exists at least for VEGA/gfx900+ to CDNA3: + Mixed use of VCC: alias vs. SGPR# - v_readlane, + v_readfirstlane, v_cmp, v_add_*i/u, v_sub_*i/u, v_div_*scale + followed by VALU reads VCC as constant requires 1 wait state. + (As carry-in, it requires none.) + [VCC can be accessed by name or logical SGPR that holds it.] */ + + /* Testing indicates that CDNA3 requires an s_nop between + e.g. 'v_cmp_eq_u64 vcc, v[4:5], v[8:9]' and 'v_mov_b32 v0, vcc_lo'. + Thus: add it between v_cmp writing VCC and VALU read of VCC. */ + if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 1 + && iunit == UNIT_VECTOR + && (hard_reg_set_intersect_p + (depregs, reg_class_contents[(int)VCC_CONDITIONAL_REG])) + && get_attr_vcmp (prev_insn->insn) == VCMP_VCMP) + nops_rqd = 1 - prev_insn->age; + + /* CDNA3: VALU writes SGPR/VCC: v_readlane, v_readfirstlane, v_cmp, + v_add_*i/u, v_sub_*i/u, v_div_*scale - followed by: + - VALU reads SGPR as constant requires 1 wait state + - VALU reads SGPR as carry-in requires no wait state + - v_readlane/v_writelane reads SGPR as lane select requires 4 wait + states. 
*/ + if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 4 + && iunit == UNIT_VECTOR + && prev_insn->unit == UNIT_VECTOR + && hard_reg_set_intersect_p + (depregs, reg_class_contents[(int) SGPR_SRC_REGS])) + { + if (get_attr_laneselect (insn) != LANESELECT_NO) + nops_rqd = 4 - prev_insn->age; + else if ((prev_insn->age + nops_rqd) < 1) + nops_rqd = 1 - prev_insn->age; + } + /* CDNA3: v_cmpx followed by - V_readlane, v_readfirstlane, v_writelane requires 4 wait states - VALU reads EXEC as constant requires 2 wait states - other VALU requires no wait state */ if (TARGET_CDNA3_NOPS && (prev_insn->age + nops_rqd) < 4 - && gcn_cmpx_insn_p (prev_insn->type) + && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX && get_attr_laneselect (insn) != LANESELECT_NO) nops_rqd = 4 - prev_insn->age; else if (TARGET_CDNA3_NOPS && (prev_insn->age + nops_rqd) < 2 && iunit == UNIT_VECTOR - && gcn_cmpx_insn_p (prev_insn->type) + && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX && TEST_HARD_REG_BIT (ireads, EXECZ_REG)) nops_rqd = 2 - prev_insn->age; @@ -6436,8 +6456,8 @@ gcn_md_reorg (void) } /* Insert the required number of NOPs. */ - for (int i = nops_rqd; i > 0; i--) - emit_insn_after (gen_nop (), last_insn); + if (nops_rqd > 0) + emit_insn_after (gen_nops (GEN_INT (nops_rqd-1)), last_insn); /* Age the previous instructions. We can also ignore writes to registers subsequently overwritten. */ @@ -7283,6 +7303,11 @@ print_operand_address (FILE *file, rtx mem) H - print second part of a multi-reg value (high-part of 2-reg value) J - print third part of a multi-reg value K - print fourth part of a multi-reg value + R - Print a scalar register number as an integer. Temporary hack. + V - Print a vector register number as an integer. Temporary hack. + + Additionally, the standard builtin c, n, a, and l exist; see gccint's + "Output Templates and Operand Substitution" for details. 
*/ void @@ -8131,6 +8156,8 @@ gcn_dwarf_register_span (rtx rtl) gcn_vectorize_builtin_vectorized_function #undef TARGET_VECTORIZE_GET_MASK_MODE #define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode +#undef TARGET_VECTORIZE_PREFER_GATHER_SCATTER +#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER gcn_prefer_gather_scatter #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index fad42e6..4130cf6 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -324,6 +324,11 @@ "store,storex34,load,atomic,atomicwait,cmpswapx2,no" (const_string "no")) +; Identify v_cmp and v_cmpx instructions for "Manually Inserted Wait State" +; handling. + +(define_attr "vcmp" "vcmp,vcmpx,no" (const_string "no")) + ; Identify instructions that require "Manually Inserted Wait State" if ; a previous instruction writes to VCC. The number gives the number of NOPs. @@ -424,6 +429,15 @@ "s_nop\t0x0" [(set_attr "type" "sopp")]) +; Variant of 'nop' that accepts a count argument. +; s_nop accepts 0x0 to 0xf for 1 to 16 nops; however, +; as %0 prints decimals, only 0 to 9 (= 1 to 10 nops) can be used. +(define_insn "nops" + [(match_operand 0 "const_int_operand")] + "" + "s_nop\t0x%0" + [(set_attr "type" "sopp")]) + ; FIXME: What should the value of the immediate be? Zero is disallowed, so ; pick 1 for now. 
(define_insn "trap" @@ -566,6 +580,7 @@ [(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,smem,flat,flat, flat,flat,flat,flat") (set_attr "flatmemaccess" "*,*,*,*,*,*,*,*,*,load,load,store,load,load,store") + (set_attr "vcmp" "*,*,*,*,vcmp,*,*,*,*,*,*,*,*,*,*") (set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*,*,*,*") (set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12,12,12,12") (set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*") @@ -1089,6 +1104,7 @@ s_cmp%D1\t%2, %3 v_cmp%E1\tvcc, %2, %3" [(set_attr "type" "sopc,vopc") + (set_attr "vcmp" "vcmp") (set_attr "length" "8")]) (define_insn "cstoredi4_vector" @@ -1099,6 +1115,7 @@ "" "v_cmp%E1\tvcc, %2, %3" [(set_attr "type" "vopc") + (set_attr "vcmp" "vcmp") (set_attr "length" "8")]) (define_expand "cbranchdi4" @@ -1125,6 +1142,7 @@ "" "v_cmp%E1\tvcc, %2, %3" [(set_attr "type" "vopc") + (set_attr "vcmp" "vcmp") (set_attr "length" "8")]) (define_expand "cbranch<mode>4" @@ -2165,7 +2183,7 @@ ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_load%o0\t%0, %A1%O1 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\t0\;buffer_inv sc1" : "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\t0\;buffer_wbinvl1_vol"); @@ -2177,7 +2195,7 @@ ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;global_load%o0\t%0, %A1%O1 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_inv sc1" : "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"); @@ -2224,7 +2242,7 @@ : TARGET_WBINVL1_CACHE ? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1" + ? 
"buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_store%o1\t%A0, %1%O0 %G1" : "error: cache architectire unspecified"); case 2: return (TARGET_GLn_CACHE @@ -2232,7 +2250,7 @@ : TARGET_WBINVL1_CACHE ? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_store%o1\t%A0, %1%O0 %G1" : "error: cache architecture unspecified"); } break; @@ -2252,7 +2270,8 @@ ? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1\;" "s_waitcnt\t0\;buffer_wbinvl1_vol" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;" + "flat_store%o1\t%A0, %1%O0 %G1\;" "s_waitcnt\t0\;buffer_inv sc1" : "error: cache architecture unspecified"); case 2: @@ -2263,7 +2282,8 @@ ? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;" + "global_store%o1\t%A0, %1%O0 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_inv sc1" : "error: cache architecture unspecified"); } @@ -2347,7 +2367,7 @@ ? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" "s_waitcnt\t0" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" "s_waitcnt\t0" : "error: cache architecture unspecified"); case 2: @@ -2360,7 +2380,7 @@ "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;" "s_waitcnt\tvmcnt(0)" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;" "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;" "s_waitcnt\tvmcnt(0)" : "error: cache architecture unspecified"); @@ -2382,7 +2402,7 @@ ? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" "s_waitcnt\t0\;buffer_wbinvl1_vol" : TARGET_TARGET_SC_CACHE - ? 
"buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" "s_waitcnt\t0\;buffer_inv sc1" : "error: cache architecture unspecified"); case 2: @@ -2395,7 +2415,7 @@ "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;" "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_inv sc1" : "error: cache architecture unspecified"); |