aboutsummaryrefslogtreecommitdiff
path: root/gcc/config/gcn
diff options
context:
space:
mode:
Diffstat (limited to 'gcc/config/gcn')
-rw-r--r--gcc/config/gcn/gcn-opts.h2
-rw-r--r--gcc/config/gcn/gcn-valu.md4
-rw-r--r--gcc/config/gcn/gcn.cc103
-rw-r--r--gcc/config/gcn/gcn.md40
4 files changed, 101 insertions, 48 deletions
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h
index fe68678..0287400 100644
--- a/gcc/config/gcn/gcn-opts.h
+++ b/gcc/config/gcn/gcn-opts.h
@@ -92,6 +92,8 @@ enum hsaco_attr_type
/* Whether to use the 'globally coherent' (glc) or the 'scope' (sc0) flag
for non-scalar memory operations. The string starts on purpose with a space.
Note: for scalar memory operations (i.e. 's_...'), 'glc' is still used.
+ Note: on atomics, glc/sc0 denotes whether the pre-op operation should
+ be used.
CDNA3 also uses 'nt' instead of 'slc' and 'sc1' instead of 'scc'; however,
there is no non-scalar user so far. */
#define TARGET_GLC_NAME (TARGET_CDNA3 ? " sc0" : " glc")
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 0994329..a34d2e3 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -3938,6 +3938,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,*,yes,yes")])
@@ -3992,6 +3993,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,*,yes,yes")])
@@ -4050,6 +4052,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,yes,yes")])
@@ -4073,6 +4076,7 @@
v_cmpx%E1\t%2, %3
v_cmpx%E1\t%2, %3"
[(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc")
+ (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx")
(set_attr "length" "4,8,4,8,8,4,8")
(set_attr "rdna" "*,*,no,no,*,yes,yes")])
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 8959118..5ffeb23 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -54,6 +54,7 @@
#include "gimple.h"
#include "cgraph.h"
#include "case-cfn-macros.h"
+#include "opts.h"
/* This file should be included last. */
#include "target-def.h"
@@ -183,6 +184,11 @@ gcn_option_override (void)
if (flag_sram_ecc == HSACO_ATTR_DEFAULT)
flag_sram_ecc = gcn_devices[gcn_arch].sramecc_default;
+
+ /* TODO: This seems to produce tighter loops, but the testsuite expects it
+ to be set to '2', so I'll leave it default for now.
+ SET_OPTION_IF_UNSET (&global_options, &global_options_set,
+ param_vect_partial_vector_usage, 1); */
}
/* }}} */
@@ -5789,45 +5795,19 @@ gcn_libc_has_function (enum function_class fn_class,
return bsd_libc_has_function (fn_class, type);
}
-/* }}} */
-/* {{{ md_reorg pass. */
-
-/* Identify V_CMPX from the "type" attribute;
- note: this will also match 'v_cmp %E1 vcc'. */
+/* Implement TARGET_VECTORIZE_PREFER_GATHER_SCATTER. */
static bool
-gcn_cmpx_insn_p (attr_type type)
+gcn_prefer_gather_scatter (machine_mode ARG_UNUSED (mode),
+ int ARG_UNUSED (scale),
+ unsigned int ARG_UNUSED (group_size))
{
- switch (type)
- {
- case TYPE_VOPC:
- return true;
- case TYPE_MUBUF:
- case TYPE_MTBUF:
- case TYPE_FLAT:
- case TYPE_VOP3P_MAI:
- case TYPE_UNKNOWN:
- case TYPE_SOP1:
- case TYPE_SOP2:
- case TYPE_SOPK:
- case TYPE_SOPC:
- case TYPE_SOPP:
- case TYPE_SMEM:
- case TYPE_DS:
- case TYPE_VOP2:
- case TYPE_VOP1:
- case TYPE_VOP3A:
- case TYPE_VOP3B:
- case TYPE_VOP_SDWA:
- case TYPE_VOP_DPP:
- case TYPE_MULT:
- case TYPE_VMULT:
- return false;
- }
- gcc_unreachable ();
- return false;
+ return true;
}
+/* }}} */
+/* {{{ md_reorg pass. */
+
/* Identify VMEM instructions from their "type" attribute. */
static bool
@@ -6356,19 +6336,59 @@ gcn_md_reorg (void)
reg_class_contents[(int)VCC_CONDITIONAL_REG])))
nops_rqd = ivccwait - prev_insn->age;
+ /* NOTE: The following condition for adding wait state exists, but
+ GCC does not access the special registers using their SGPR#.
+ Thus, no action is required here. The following wait-state
+ condition exists at least for VEGA/gfx900+ to CDNA3:
+ Mixed use of VCC: alias vs. SGPR# - v_readlane,
+ v_readfirstlane, v_cmp, v_add_*i/u, v_sub_*i/u, v_div_*scale
+ followed by VALU reads VCC as constant requires 1 wait state.
+ (As carry-in, it requires none.)
+ [VCC can be accessed by name or logical SGPR that holds it.] */
+
+ /* Testing indicates that CDNA3 requires an s_nop between
+ e.g. 'v_cmp_eq_u64 vcc, v[4:5], v[8:9]' and 'v_mov_b32 v0, vcc_lo'.
+ Thus: add it between v_cmp writing VCC and VALU read of VCC. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && iunit == UNIT_VECTOR
+ && (hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int)VCC_CONDITIONAL_REG]))
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMP)
+ nops_rqd = 1 - prev_insn->age;
+
+ /* CDNA3: VALU writes SGPR/VCC: v_readlane, v_readfirstlane, v_cmp,
+ v_add_*i/u, v_sub_*i/u, v_div_*scale - followed by:
+ - VALU reads SGPR as constant requires 1 wait state
+ - VALU reads SGPR as carry-in requires no wait state
+ - v_readlane/v_writelane reads SGPR as lane select requires 4 wait
+ states. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 4
+ && iunit == UNIT_VECTOR
+ && prev_insn->unit == UNIT_VECTOR
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) SGPR_SRC_REGS]))
+ {
+ if (get_attr_laneselect (insn) != LANESELECT_NO)
+ nops_rqd = 4 - prev_insn->age;
+ else if ((prev_insn->age + nops_rqd) < 1)
+ nops_rqd = 1 - prev_insn->age;
+ }
+
/* CDNA3: v_cmpx followed by
- V_readlane, v_readfirstlane, v_writelane requires 4 wait states
- VALU reads EXEC as constant requires 2 wait states
- other VALU requires no wait state */
if (TARGET_CDNA3_NOPS
&& (prev_insn->age + nops_rqd) < 4
- && gcn_cmpx_insn_p (prev_insn->type)
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX
&& get_attr_laneselect (insn) != LANESELECT_NO)
nops_rqd = 4 - prev_insn->age;
else if (TARGET_CDNA3_NOPS
&& (prev_insn->age + nops_rqd) < 2
&& iunit == UNIT_VECTOR
- && gcn_cmpx_insn_p (prev_insn->type)
+ && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX
&& TEST_HARD_REG_BIT (ireads, EXECZ_REG))
nops_rqd = 2 - prev_insn->age;
@@ -6436,8 +6456,8 @@ gcn_md_reorg (void)
}
/* Insert the required number of NOPs. */
- for (int i = nops_rqd; i > 0; i--)
- emit_insn_after (gen_nop (), last_insn);
+ if (nops_rqd > 0)
+ emit_insn_after (gen_nops (GEN_INT (nops_rqd-1)), last_insn);
/* Age the previous instructions. We can also ignore writes to
registers subsequently overwritten. */
@@ -7283,6 +7303,11 @@ print_operand_address (FILE *file, rtx mem)
H - print second part of a multi-reg value (high-part of 2-reg value)
J - print third part of a multi-reg value
K - print fourth part of a multi-reg value
+ R - Print a scalar register number as an integer. Temporary hack.
+ V - Print a vector register number as an integer. Temporary hack.
+
+ Additionally, the standard builtin c, n, a, and l exist; see gccint's
+ "Output Templates and Operand Substitution" for details.
*/
void
@@ -8131,6 +8156,8 @@ gcn_dwarf_register_span (rtx rtl)
gcn_vectorize_builtin_vectorized_function
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
+#undef TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER gcn_prefer_gather_scatter
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index fad42e6..4130cf6 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -324,6 +324,11 @@
"store,storex34,load,atomic,atomicwait,cmpswapx2,no"
(const_string "no"))
+; Identify v_cmp and v_cmpx instructions for "Manually Inserted Wait State"
+; handling.
+
+(define_attr "vcmp" "vcmp,vcmpx,no" (const_string "no"))
+
; Identify instructions that require "Manually Inserted Wait State" if
; a previous instruction writes to VCC. The number gives the number of NOPs.
@@ -424,6 +429,15 @@
"s_nop\t0x0"
[(set_attr "type" "sopp")])
+; Variant of 'nop' that accepts a count argument.
+; s_nop accepts 0x0 to 0xf for 1 to 16 nops; however,
+; as %0 prints in decimal after "0x", only 0 to 9 (= 1 to 10 nops) are valid.
+(define_insn "nops"
+ [(match_operand 0 "const_int_operand")]
+ ""
+ "s_nop\t0x%0"
+ [(set_attr "type" "sopp")])
+
; FIXME: What should the value of the immediate be? Zero is disallowed, so
; pick 1 for now.
(define_insn "trap"
@@ -566,6 +580,7 @@
[(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,smem,flat,flat,
flat,flat,flat,flat")
(set_attr "flatmemaccess" "*,*,*,*,*,*,*,*,*,load,load,store,load,load,store")
+ (set_attr "vcmp" "*,*,*,*,vcmp,*,*,*,*,*,*,*,*,*,*")
(set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*,*,*,*")
(set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12,12,12,12")
(set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")
@@ -1089,6 +1104,7 @@
s_cmp%D1\t%2, %3
v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "sopc,vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_insn "cstoredi4_vector"
@@ -1099,6 +1115,7 @@
""
"v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_expand "cbranchdi4"
@@ -1125,6 +1142,7 @@
""
"v_cmp%E1\tvcc, %2, %3"
[(set_attr "type" "vopc")
+ (set_attr "vcmp" "vcmp")
(set_attr "length" "8")])
(define_expand "cbranch<mode>4"
@@ -2165,7 +2183,7 @@
? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_load%o0\t%0, %A1%O1 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\t0\;buffer_inv sc1"
: "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\t0\;buffer_wbinvl1_vol");
@@ -2177,7 +2195,7 @@
? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;global_load%o0\t%0, %A1%O1 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
@@ -2224,7 +2242,7 @@
: TARGET_WBINVL1_CACHE
? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_store%o1\t%A0, %1%O0 %G1"
: "error: cache architectire unspecified");
case 2:
return (TARGET_GLn_CACHE
@@ -2232,7 +2250,7 @@
: TARGET_WBINVL1_CACHE
? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_store%o1\t%A0, %1%O0 %G1"
: "error: cache architecture unspecified");
}
break;
@@ -2252,7 +2270,8 @@
? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\t0\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;"
+ "flat_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\t0\;buffer_inv sc1"
: "error: cache architecture unspecified");
case 2:
@@ -2263,7 +2282,8 @@
? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
+ "global_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "error: cache architecture unspecified");
}
@@ -2347,7 +2367,7 @@
? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0"
: "error: cache architecture unspecified");
case 2:
@@ -2360,7 +2380,7 @@
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)"
: "error: cache architecture unspecified");
@@ -2382,7 +2402,7 @@
? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
"s_waitcnt\t0\;buffer_inv sc1"
: "error: cache architecture unspecified");
case 2:
@@ -2395,7 +2415,7 @@
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
- ? "buffer_inv sc1\;"
+ ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "error: cache architecture unspecified");