4 files changed, 101 insertions, 48 deletions
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h
index fe68678..0287400 100644
--- a/gcc/config/gcn/gcn-opts.h
+++ b/gcc/config/gcn/gcn-opts.h
@@ -92,6 +92,8 @@ enum hsaco_attr_type
 /* Whether to use the 'globally coherent' (glc) or the 'scope' (sc0) flag
    for non-scalar memory operations. The string starts on purpose with a space.
    Note: for scalar memory operations (i.e. 's_...'), 'glc' is still used.
+   Note: on atomics, glc/sc0 denotes whether the pre-op value should be
+   returned.
    CDNA3 also uses 'nt' instead of 'slc' and 'sc1' instead of 'scc'; however,
    there is no non-scalar user so far.  */
 #define TARGET_GLC_NAME (TARGET_CDNA3 ? " sc0" : " glc")
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 0994329..a34d2e3 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -3938,6 +3938,7 @@
    v_cmpx%E1\t%2, %3
    v_cmpx%E1\t%2, %3"
   [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc")
+   (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx")
    (set_attr "length" "4,8,4,8,8,8,4,8")
    (set_attr "rdna" "*,*,no,no,*,*,yes,yes")])
 
@@ -3992,6 +3993,7 @@
    v_cmpx%E1\t%2, %3
    v_cmpx%E1\t%2, %3"
   [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vop3a,vopc,vopc")
+   (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmp,vcmpx,vcmpx")
    (set_attr "length" "4,8,4,8,8,8,4,8")
    (set_attr "rdna" "*,*,no,no,*,*,yes,yes")])
 
@@ -4050,6 +4052,7 @@
    v_cmpx%E1\t%2, %3
    v_cmpx%E1\t%2, %3"
   [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc")
+   (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx")
    (set_attr "length" "4,8,4,8,8,4,8")
    (set_attr "rdna" "*,*,no,no,*,yes,yes")])
 
@@ -4073,6 +4076,7 @@
    v_cmpx%E1\t%2, %3
    v_cmpx%E1\t%2, %3"
   [(set_attr "type" "vopc,vopc,vopc,vopc,vop3a,vopc,vopc")
+   (set_attr "vcmp" "vcmp,vcmp,vcmpx,vcmpx,vcmp,vcmpx,vcmpx")
    (set_attr "length" "4,8,4,8,8,4,8")
    (set_attr "rdna" "*,*,no,no,*,yes,yes")])
 
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 8959118..5ffeb23 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -54,6 +54,7 @@
 #include "gimple.h"
 #include "cgraph.h"
 #include "case-cfn-macros.h"
+#include "opts.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -183,6 +184,11 @@ gcn_option_override (void)
 
   if (flag_sram_ecc == HSACO_ATTR_DEFAULT)
     flag_sram_ecc = gcn_devices[gcn_arch].sramecc_default;
+
+  /* TODO: This seems to produce tighter loops, but the testsuite expects it
+     to be set to '2', so I'll leave it default for now.
+  SET_OPTION_IF_UNSET (&global_options, &global_options_set,
+		       param_vect_partial_vector_usage, 1);  */
 }
 
 /* }}}  */
@@ -5789,45 +5795,19 @@ gcn_libc_has_function (enum function_class fn_class,
   return bsd_libc_has_function (fn_class, type);
 }
 
-/* }}}  */
-/* {{{ md_reorg pass.  */
-
-/* Identify V_CMPX from the "type" attribute;
-   note: this will also match 'v_cmp %E1 vcc'.  */
+/* Implement TARGET_VECTORIZE_PREFER_GATHER_SCATTER.  */
 
 static bool
-gcn_cmpx_insn_p (attr_type type)
+gcn_prefer_gather_scatter (machine_mode ARG_UNUSED (mode),
+			   int ARG_UNUSED (scale),
+			   unsigned int ARG_UNUSED (group_size))
 {
-  switch (type)
-    {
-    case TYPE_VOPC:
-      return true;
-    case TYPE_MUBUF:
-    case TYPE_MTBUF:
-    case TYPE_FLAT:
-    case TYPE_VOP3P_MAI:
-    case TYPE_UNKNOWN:
-    case TYPE_SOP1:
-    case TYPE_SOP2:
-    case TYPE_SOPK:
-    case TYPE_SOPC:
-    case TYPE_SOPP:
-    case TYPE_SMEM:
-    case TYPE_DS:
-    case TYPE_VOP2:
-    case TYPE_VOP1:
-    case TYPE_VOP3A:
-    case TYPE_VOP3B:
-    case TYPE_VOP_SDWA:
-    case TYPE_VOP_DPP:
-    case TYPE_MULT:
-    case TYPE_VMULT:
-      return false;
-    }
-  gcc_unreachable ();
-  return false;
+  return true;
 }
 
+/* }}}  */
+/* {{{ md_reorg pass.  */
+
 /* Identify VMEM instructions from their "type" attribute.  */
 
 static bool
@@ -6356,19 +6336,59 @@ gcn_md_reorg (void)
 		   reg_class_contents[(int)VCC_CONDITIONAL_REG])))
 	    nops_rqd = ivccwait - prev_insn->age;
 
+	  /* NOTE: The following condition for adding wait state exists, but
+	     GCC does not access the special registers using their SGPR#.
+	     Thus, no action is required here.  The following wait-state
+	     condition exists at least for VEGA/gfx900+ to CDNA3:
+		Mixed use of VCC: alias vs. SGPR# - v_readlane,
+		v_readfirstlane, v_cmp, v_add_*i/u, v_sub_*i/u, v_div_*scale
+		followed by VALU reads VCC as constant requires 1 wait state.
+		(As carry-in, it requires none.)
+		[VCC can be accessed by name or logical SGPR that holds it.]  */
+
+	  /* Testing indicates that CDNA3 requires an s_nop between
+	     e.g. 'v_cmp_eq_u64 vcc, v[4:5], v[8:9]' and 'v_mov_b32 v0, vcc_lo'.
+	     Thus: add it between v_cmp writing VCC and VALU read of VCC.  */
+	  if (TARGET_CDNA3_NOPS
+	      && (prev_insn->age + nops_rqd) < 1
+	      && iunit == UNIT_VECTOR
+	      && (hard_reg_set_intersect_p
+		  (depregs, reg_class_contents[(int)VCC_CONDITIONAL_REG]))
+	      && get_attr_vcmp (prev_insn->insn) == VCMP_VCMP)
+	    nops_rqd = 1 - prev_insn->age;
+
+	  /* CDNA3: VALU writes SGPR/VCC: v_readlane, v_readfirstlane, v_cmp,
+	     v_add_*i/u, v_sub_*i/u, v_div_*scale - followed by:
+	     - VALU reads SGPR as constant requires 1 wait state
+	     - VALU reads SGPR as carry-in requires no wait state
+	     - v_readlane/v_writelane reads SGPR as lane select requires 4 wait
+	       states.  */
+	  if (TARGET_CDNA3_NOPS
+	      && (prev_insn->age + nops_rqd) < 4
+	      && iunit == UNIT_VECTOR
+	      && prev_insn->unit == UNIT_VECTOR
+	      && hard_reg_set_intersect_p
+		   (depregs, reg_class_contents[(int) SGPR_SRC_REGS]))
+	    {
+	      if (get_attr_laneselect (insn) != LANESELECT_NO)
+		nops_rqd = 4 - prev_insn->age;
+	      else if ((prev_insn->age + nops_rqd) < 1)
+		nops_rqd = 1 - prev_insn->age;
+	    }
+
 	  /* CDNA3: v_cmpx followed by
 	     - V_readlane, v_readfirstlane, v_writelane requires 4 wait states
 	     - VALU reads EXEC as constant requires 2 wait states
 	     - other VALU requires no wait state  */
 	  if (TARGET_CDNA3_NOPS
 	      && (prev_insn->age + nops_rqd) < 4
-	      && gcn_cmpx_insn_p (prev_insn->type)
+	      && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX
 	      && get_attr_laneselect (insn) != LANESELECT_NO)
 	    nops_rqd = 4 - prev_insn->age;
 	  else if (TARGET_CDNA3_NOPS
 		   && (prev_insn->age + nops_rqd) < 2
 		   && iunit == UNIT_VECTOR
-		   && gcn_cmpx_insn_p (prev_insn->type)
+		   && get_attr_vcmp (prev_insn->insn) == VCMP_VCMPX
 		   && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
 	    nops_rqd = 2 - prev_insn->age;
 
@@ -6436,8 +6456,8 @@ gcn_md_reorg (void)
 	}
 
       /* Insert the required number of NOPs.  */
-      for (int i = nops_rqd; i > 0; i--)
-	emit_insn_after (gen_nop (), last_insn);
+      if (nops_rqd > 0)
+	emit_insn_after (gen_nops (GEN_INT (nops_rqd-1)), last_insn);
 
       /* Age the previous instructions.  We can also ignore writes to
          registers subsequently overwritten.  */
@@ -7283,6 +7303,11 @@ print_operand_address (FILE *file, rtx mem)
    H - print second part of a multi-reg value (high-part of 2-reg value)
    J - print third part of a multi-reg value
    K - print fourth part of a multi-reg value
+   R - Print a scalar register number as an integer.  Temporary hack.
+   V - Print a vector register number as an integer.  Temporary hack.
+
+   Additionally, the standard builtin c, n, a, and l exist; see gccint's
+   "Output Templates and Operand Substitution" for details.
  */
 
 void
@@ -8131,6 +8156,8 @@ gcn_dwarf_register_span (rtx rtl)
   gcn_vectorize_builtin_vectorized_function
 #undef  TARGET_VECTORIZE_GET_MASK_MODE
 #define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
+#undef  TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER gcn_prefer_gather_scatter
 #undef  TARGET_VECTORIZE_PREFERRED_SIMD_MODE
 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
 #undef  TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index fad42e6..4130cf6 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -324,6 +324,11 @@
              "store,storex34,load,atomic,atomicwait,cmpswapx2,no"
              (const_string "no"))
 
+; Identify v_cmp and v_cmpx instructions for "Manually Inserted Wait State"
+; handling.
+
+(define_attr "vcmp" "vcmp,vcmpx,no" (const_string "no"))
+
 ; Identify instructions that require "Manually Inserted Wait State" if
 ; a previous instruction writes to VCC.  The number gives the number of NOPs.
 
@@ -424,6 +429,15 @@
   "s_nop\t0x0"
   [(set_attr "type" "sopp")])
 
+; Variant of 'nop' that accepts a count argument.
+; s_nop accepts 0x0 to 0xf for 1 to 16 nops; however,
+; as %0 prints decimals, only 0 to 9 (= 1 to 10 nops) can be used.
+(define_insn "nops"
+  [(match_operand 0 "const_int_operand")]
+  ""
+  "s_nop\t0x%0"
+  [(set_attr "type" "sopp")])
+
 ; FIXME: What should the value of the immediate be? Zero is disallowed, so
 ; pick 1 for now.
 (define_insn "trap"
@@ -566,6 +580,7 @@
   [(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,smem,flat,flat,
 		     flat,flat,flat,flat")
    (set_attr "flatmemaccess" "*,*,*,*,*,*,*,*,*,load,load,store,load,load,store")
+   (set_attr "vcmp" "*,*,*,*,vcmp,*,*,*,*,*,*,*,*,*,*")
    (set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*,*,*,*")
    (set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12,12,12,12")
    (set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")
@@ -1089,6 +1104,7 @@
    s_cmp%D1\t%2, %3
    v_cmp%E1\tvcc, %2, %3"
   [(set_attr "type" "sopc,vopc")
+   (set_attr "vcmp" "vcmp")
    (set_attr "length" "8")])
 
 (define_insn "cstoredi4_vector"
@@ -1099,6 +1115,7 @@
   ""
   "v_cmp%E1\tvcc, %2, %3"
   [(set_attr "type" "vopc")
+   (set_attr "vcmp" "vcmp")
    (set_attr "length" "8")])
 
 (define_expand "cbranchdi4"
@@ -1125,6 +1142,7 @@
   ""
   "v_cmp%E1\tvcc, %2, %3"
   [(set_attr "type" "vopc")
+   (set_attr "vcmp" "vcmp")
    (set_attr "length" "8")])
 
 (define_expand "cbranch<mode>4"
@@ -2165,7 +2183,7 @@
 		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 %G1\;"
 		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : TARGET_TARGET_SC_CACHE
-		    ? "buffer_inv sc1\;flat_load%o0\t%0, %A1%O1 %G1\;"
+		    ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_load%o0\t%0, %A1%O1 %G1\;"
 		      "s_waitcnt\t0\;buffer_inv sc1"
 		    : "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 %G1\;"
 		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
@@ -2177,7 +2195,7 @@
 		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : TARGET_TARGET_SC_CACHE
-		    ? "buffer_inv sc1\;global_load%o0\t%0, %A1%O1 %G1\;"
+		    ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_load%o0\t%0, %A1%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
 		    : "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
@@ -2224,7 +2242,7 @@
 		    : TARGET_WBINVL1_CACHE
 		    ? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1"
 		    : TARGET_TARGET_SC_CACHE
-		    ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1"
+		    ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_store%o1\t%A0, %1%O0 %G1"
 		    : "error: cache architectire unspecified");
 	  case 2:
 	    return (TARGET_GLn_CACHE
@@ -2232,7 +2250,7 @@
 		    : TARGET_WBINVL1_CACHE
 		    ? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1"
 		    : TARGET_TARGET_SC_CACHE
-		    ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1"
+		    ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_store%o1\t%A0, %1%O0 %G1"
 		    : "error: cache architecture unspecified");
 	  }
 	break;
@@ -2252,7 +2270,8 @@
 		    ? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1\;"
 		      "s_waitcnt\t0\;buffer_wbinvl1_vol"
 		    : TARGET_TARGET_SC_CACHE
-		    ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1\;"
+		    ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;"
+		      "flat_store%o1\t%A0, %1%O0 %G1\;"
 		      "s_waitcnt\t0\;buffer_inv sc1"
 		    : "error: cache architecture unspecified");
 	  case 2:
@@ -2263,7 +2282,8 @@
 		    ? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
 		    : TARGET_TARGET_SC_CACHE
-		    ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1\;"
+		    ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
+		      "global_store%o1\t%A0, %1%O0 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
 		    : "error: cache architecture unspecified");
 	  }
@@ -2347,7 +2367,7 @@
             ? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
 		      "s_waitcnt\t0"
 	    : TARGET_TARGET_SC_CACHE
-            ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
+            ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
 		      "s_waitcnt\t0"
             : "error: cache architecture unspecified");
 	  case 2:
@@ -2360,7 +2380,7 @@
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)"
 	    : TARGET_TARGET_SC_CACHE
-            ? "buffer_inv sc1\;"
+            ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)"
             : "error: cache architecture unspecified");
@@ -2382,7 +2402,7 @@
             ? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
 		      "s_waitcnt\t0\;buffer_wbinvl1_vol"
 	    : TARGET_TARGET_SC_CACHE
-            ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
+            ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
 		      "s_waitcnt\t0\;buffer_inv sc1"
             : "error: cache architecture unspecified");
 	  case 2:
@@ -2395,7 +2415,7 @@
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
 	    : TARGET_TARGET_SC_CACHE
-            ? "buffer_inv sc1\;"
+            ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
             : "error: cache architecture unspecified");