aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- gcc/config/gcn/gcn-opts.h 5
-rw-r--r-- gcc/config/gcn/gcn-valu.md 27
-rw-r--r-- gcc/config/gcn/gcn.cc 168
-rw-r--r-- gcc/config/gcn/gcn.md 249
4 files changed, 312 insertions, 137 deletions
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h
index 0bfc786..fe68678 100644
--- a/gcc/config/gcn/gcn-opts.h
+++ b/gcc/config/gcn/gcn-opts.h
@@ -82,8 +82,13 @@ enum hsaco_attr_type
#define TARGET_DPP_FULL !TARGET_RDNA2_PLUS
#define TARGET_DPP16 TARGET_RDNA2_PLUS
#define TARGET_DPP8 TARGET_RDNA2_PLUS
+/* Device requires no manually inserted wait states; that's the
+ case for RDNA 2, 3 and 3.5 (but not for RDNA 4). */
+#define TARGET_NO_MANUAL_NOPS TARGET_RDNA2_PLUS
/* Device requires CDNA1-style manually inserted wait states for AVGPRs. */
#define TARGET_AVGPR_CDNA1_NOPS TARGET_CDNA1
+/* Device requires CDNA3-style manually inserted wait states. */
+#define TARGET_CDNA3_NOPS TARGET_CDNA3
/* Whether to use the 'globally coherent' (glc) or the 'scope' (sc0) flag
for non-scalar memory operations. The string starts on purpose with a space.
Note: for scalar memory operations (i.e. 's_...'), 'glc' is still used.
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 3899117..0994329 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -811,7 +811,7 @@
[(set_attr "type" "vop3a")
(set_attr "length" "8")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "write")])
; FIXME: 64bit operations really should be splitters, but I am not sure how
; to represent vertical subregs.
@@ -828,7 +828,7 @@
[(set_attr "type" "vmult")
(set_attr "length" "16")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "write")])
(define_expand "vec_set<mode>"
[(set (match_operand:V_MOV 0 "register_operand")
@@ -854,7 +854,7 @@
[(set_attr "type" "vop3a")
(set_attr "length" "8")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "write")])
(define_insn "*vec_set<mode>_1"
[(set (match_operand:V_2REG 0 "register_operand" "=v")
@@ -871,7 +871,7 @@
[(set_attr "type" "vmult")
(set_attr "length" "16")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "write")])
(define_insn "vec_duplicate<mode><exec>"
[(set (match_operand:V_1REG 0 "register_operand" "=v")
@@ -910,7 +910,7 @@
[(set_attr "type" "vop3a")
(set_attr "length" "8")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "read")])
(define_insn "vec_extract<mode><scalar_mode>"
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=&Sg")
@@ -922,7 +922,7 @@
[(set_attr "type" "vmult")
(set_attr "length" "16")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "read")])
(define_insn "vec_extract<mode><scalar_mode>"
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=&Sg")
@@ -934,7 +934,7 @@
[(set_attr "type" "vmult")
(set_attr "length" "32")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "read")])
(define_insn "vec_extract<V_1REG:mode><V_1REG_ALT:mode>_nop"
[(set (match_operand:V_1REG_ALT 0 "register_operand" "=v,v")
@@ -1192,6 +1192,7 @@
return buf;
}
[(set_attr "type" "flat")
+ (set_attr "flatmemaccess" "load")
(set_attr "length" "12")
(set_attr "cdna" "*,cdna2,*,cdna2")
(set_attr "xnack" "off,off,on,on")])
@@ -1250,6 +1251,7 @@
return buf;
}
[(set_attr "type" "flat")
+ (set_attr "flatmemaccess" "load")
(set_attr "length" "12")
(set_attr "cdna" "*,cdna2,*,cdna2")
(set_attr "xnack" "off,off,on,on")])
@@ -1335,6 +1337,7 @@
return buf;
}
[(set_attr "type" "flat")
+ (set_attr "flatmemaccess" "store")
(set_attr "length" "12")
(set_attr "cdna" "*,cdna2")])
@@ -1390,6 +1393,7 @@
return buf;
}
[(set_attr "type" "flat")
+ (set_attr "flatmemaccess" "store")
(set_attr "length" "12")
(set_attr "cdna" "*,cdna2")])
@@ -3260,7 +3264,8 @@
"flag_unsafe_math_optimizations"
"v_sqrt%i0\t%0, %1"
[(set_attr "type" "vop1")
- (set_attr "length" "8")])
+ (set_attr "length" "8")
+ (set_attr "transop" "yes")])
(define_insn "sqrt<mode>2"
[(set (match_operand:FP 0 "register_operand" "= v")
@@ -3269,7 +3274,8 @@
"flag_unsafe_math_optimizations"
"v_sqrt%i0\t%0, %1"
[(set_attr "type" "vop1")
- (set_attr "length" "8")])
+ (set_attr "length" "8")
+ (set_attr "transop" "yes")])
; These FP unops have f64, f32 and f16 versions.
(define_int_iterator MATH_UNOP_1OR2REG
@@ -3559,7 +3565,8 @@
""
"v_rcp%i0\t%0, %1"
[(set_attr "type" "vop1")
- (set_attr "length" "8")])
+ (set_attr "length" "8")
+ (set_attr "transop" "yes")])
;; v_div_scale takes a numerator (op2) and denominator (op1) and returns the
;; one that matches op3 adjusted for best results in reciprocal division.
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 6cd17d9..8959118 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -5792,6 +5792,42 @@ gcn_libc_has_function (enum function_class fn_class,
/* }}} */
/* {{{ md_reorg pass. */
+/* Return true if TYPE (an insn's "type" attribute) may denote V_CMPX;
+ note: this will also match 'v_cmp %E1 vcc'. */
+
+static bool
+gcn_cmpx_insn_p (attr_type type)
+{
+ switch (type)
+ {
+ case TYPE_VOPC: /* The only type for which true is returned. */
+ return true;
+ case TYPE_MUBUF: /* All other types are listed explicitly; no default. */
+ case TYPE_MTBUF:
+ case TYPE_FLAT:
+ case TYPE_VOP3P_MAI:
+ case TYPE_UNKNOWN:
+ case TYPE_SOP1:
+ case TYPE_SOP2:
+ case TYPE_SOPK:
+ case TYPE_SOPC:
+ case TYPE_SOPP:
+ case TYPE_SMEM:
+ case TYPE_DS:
+ case TYPE_VOP2:
+ case TYPE_VOP1:
+ case TYPE_VOP3A:
+ case TYPE_VOP3B:
+ case TYPE_VOP_SDWA:
+ case TYPE_VOP_DPP:
+ case TYPE_MULT:
+ case TYPE_VMULT:
+ return false;
+ }
+ gcc_unreachable (); /* Only reached if TYPE matched no case label above. */
+ return false;
+}
+
/* Identify VMEM instructions from their "type" attribute. */
static bool
@@ -6152,12 +6188,22 @@ gcn_md_reorg (void)
detects the missed cases, and inserts the documented number of NOPs
required for correct execution. */
+ /* RDNA4 (not yet implemented) differs from RDNA 2/3/3.5 and requires some
+ s_nop, see 5.7 and esp. 5.7.2. in its ISA manual.
+ The assert here is a reminder to add those. */
+ STATIC_ASSERT (ISA_CDNA1 - ISA_RDNA3 == 1);
+
+ if (TARGET_NO_MANUAL_NOPS)
+ return;
+
const int max_waits = 5;
struct ilist
{
rtx_insn *insn;
attr_unit unit;
- attr_delayeduse delayeduse;
+ attr_type type;
+ attr_flatmemaccess flatmemaccess;
+ bool delayeduse;
HARD_REG_SET writes;
HARD_REG_SET reads;
int age;
@@ -6178,7 +6224,29 @@ gcn_md_reorg (void)
attr_type itype = get_attr_type (insn);
attr_unit iunit = get_attr_unit (insn);
- attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
+ attr_flatmemaccess iflatmemaccess = get_attr_flatmemaccess (insn);
+ bool delayeduse;
+ if (TARGET_CDNA3_NOPS)
+ switch (iflatmemaccess)
+ {
+ case FLATMEMACCESS_STORE:
+ case FLATMEMACCESS_STOREX34:
+ case FLATMEMACCESS_ATOMIC:
+ case FLATMEMACCESS_CMPSWAPX2:
+ delayeduse = true;
+ break;
+ case FLATMEMACCESS_LOAD:
+ case FLATMEMACCESS_ATOMICWAIT:
+ case FLATMEMACCESS_NO:
+ delayeduse = false;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ else
+ delayeduse = (iflatmemaccess == FLATMEMACCESS_CMPSWAPX2
+ || iflatmemaccess == FLATMEMACCESS_STOREX34);
+
int ivccwait = get_attr_vccwait (insn);
HARD_REG_SET ireads, iwrites;
CLEAR_HARD_REG_SET (ireads);
@@ -6223,16 +6291,26 @@ gcn_md_reorg (void)
&& TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
nops_rqd = 5 - prev_insn->age;
- /* VALU writes SGPR/VCC followed by v_{read,write}lane using
- SGPR/VCC as lane select requires 4 wait states. */
+ /* VALU writes SGPR/VCC followed by
+ - v_{read,write}lane using SGPR/VCC as lane select requires
+ 4 wait states
+ - [CDNA3] VALU reads SGPR as constant requires 1 wait state
+ - [CDNA3] VALU reads SGPR as carry-in requires no wait states */
if ((prev_insn->age + nops_rqd) < 4
&& prev_insn->unit == UNIT_VECTOR
- && get_attr_laneselect (insn) == LANESELECT_YES
+ && get_attr_laneselect (insn) != LANESELECT_NO
&& (hard_reg_set_intersect_p
(depregs, reg_class_contents[(int) SGPR_REGS])
|| hard_reg_set_intersect_p
(depregs, reg_class_contents[(int) VCC_CONDITIONAL_REG])))
nops_rqd = 4 - prev_insn->age;
+ else if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && prev_insn->unit == UNIT_VECTOR
+ && iunit == UNIT_VECTOR
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) SGPR_REGS]))
+ nops_rqd = 1 - prev_insn->age;
/* VALU writes VGPR followed by VALU_DPP reading that VGPR
requires 2 wait states. */
@@ -6245,22 +6323,88 @@ gcn_md_reorg (void)
nops_rqd = 2 - prev_insn->age;
}
+ /* VALU writes EXEC followed by VALU DPP op requires 5 nop. */
+ if ((prev_insn->age + nops_rqd) < 5
+ && itype == TYPE_VOP_DPP
+ && prev_insn->unit == UNIT_VECTOR
+ && TEST_HARD_REG_BIT (prev_insn->writes, EXECZ_REG))
+ nops_rqd = 5 - prev_insn->age;
+
/* Store that requires input registers are not overwritten by
- following instruction. */
- if ((prev_insn->age + nops_rqd) < 1
- && prev_insn->delayeduse == DELAYEDUSE_YES
+ following instruction.
+ For CDNA3 only, VALU writes require 2 nops instead of 1.
+ CDNA3 additionally requires 1 or 2 nops for global & scratch
+ store/atomic. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 2
+ && prev_insn->delayeduse
+ && iunit == UNIT_VECTOR
+ && ((hard_reg_set_intersect_p
+ (prev_insn->reads, iwrites))))
+ nops_rqd = 2 - prev_insn->age;
+ else if ((prev_insn->age + nops_rqd) < 1
+ && prev_insn->delayeduse
&& ((hard_reg_set_intersect_p
(prev_insn->reads, iwrites))))
nops_rqd = 1 - prev_insn->age;
- /* Instruction that requires VCC is not written too close before
- using it. */
+ /* Instruction (such as v_div_fmas) that requires VCC is not written
+ too close before using it. */
if (prev_insn->age < ivccwait
&& (hard_reg_set_intersect_p
(prev_insn->writes,
reg_class_contents[(int)VCC_CONDITIONAL_REG])))
nops_rqd = ivccwait - prev_insn->age;
+ /* CDNA3: v_cmpx followed by
+ - v_readlane, v_readfirstlane, v_writelane requires 4 wait states
+ - VALU reads EXEC as constant requires 2 wait states
+ - other VALU requires no wait state */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 4
+ && gcn_cmpx_insn_p (prev_insn->type)
+ && get_attr_laneselect (insn) != LANESELECT_NO)
+ nops_rqd = 4 - prev_insn->age;
+ else if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 2
+ && iunit == UNIT_VECTOR
+ && gcn_cmpx_insn_p (prev_insn->type)
+ && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
+ nops_rqd = 2 - prev_insn->age;
+
+ /* CDNA3: VALU writes VGPR followed by v_readlane vsrc0 reads VGPRn
+ requires 1 wait state. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && prev_insn->unit == UNIT_VECTOR
+ && prev_insn->flatmemaccess != FLATMEMACCESS_LOAD
+ && get_attr_laneselect (insn) == LANESELECT_READ
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) VGPR_REGS]))
+ nops_rqd = 1 - prev_insn->age;
+
+ /* CDNA3: VALU op that uses OPSEL or SDWA to change the result's bit
+ position, followed by a VALU op that consumes that result,
+ requires 1 wait state.
+ FIXME: Handle OPSEL, once used. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && prev_insn->unit == UNIT_VECTOR
+ && prev_insn->type == TYPE_VOP_SDWA
+ && !hard_reg_set_empty_p (depregs))
+ nops_rqd = 1 - prev_insn->age;
+
+ /* CDNA3: VALU Trans Op (such as v_rcp_f64) followed by non-trans VALU
+ op consumes result of that op requires 1 wait state. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && prev_insn->unit == UNIT_VECTOR
+ && iunit == UNIT_VECTOR
+ && get_attr_transop (prev_insn->insn) == TRANSOP_YES
+ && get_attr_transop (insn) == TRANSOP_NO
+ && !hard_reg_set_empty_p (depregs))
+ nops_rqd = 1 - prev_insn->age;
+
/* CDNA1: write VGPR before v_accvgpr_write reads it. */
if (TARGET_AVGPR_CDNA1_NOPS
&& (prev_insn->age + nops_rqd) < 2
@@ -6316,7 +6460,9 @@ gcn_md_reorg (void)
/* Track the current instruction as a previous instruction. */
back[oldest].insn = insn;
back[oldest].unit = iunit;
- back[oldest].delayeduse = idelayeduse;
+ back[oldest].type = itype;
+ back[oldest].flatmemaccess = iflatmemaccess;
+ back[oldest].delayeduse = delayeduse;
back[oldest].writes = iwrites;
back[oldest].reads = ireads;
back[oldest].age = 0;
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 9193461..fad42e6 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -312,18 +312,28 @@
; We need to be able to identify v_readlane and v_writelane with
; SGPR lane selection in order to handle "Manually Inserted Wait States".
-(define_attr "laneselect" "yes,no" (const_string "no"))
+(define_attr "laneselect" "write,read,no" (const_string "no"))
-; Identify instructions that require a "Manually Inserted Wait State" if
-; their inputs are overwritten by subsequent instructions.
+; Global or flat memory access using store or load followed by waitcnt
+; and using flat/global atomic access, possibly followed by a waitcnt.
+; 'storex34' denotes FLAT_STORE_X{3,4}.
+; 'cmpswapx2' denotes FLAT_ATOMIC_{F}CMPSWAP_X2
+; Used to handle "Manually Inserted Wait State".
-(define_attr "delayeduse" "yes,no" (const_string "no"))
+(define_attr "flatmemaccess"
+ "store,storex34,load,atomic,atomicwait,cmpswapx2,no"
+ (const_string "no"))
; Identify instructions that require "Manually Inserted Wait State" if
; a previous instruction writes to VCC. The number gives the number of NOPs.
(define_attr "vccwait" "" (const_int 0))
+; Mark trans ops such as v_{exp,rsq,sqrt,sin,cos,log,...}_F{16,32,64}
+; for later conditional s_nop insertion.
+
+(define_attr "transop" "yes,no" (const_string "no"))
+
;; }}}
;; {{{ Iterators useful across the wole machine description
@@ -555,9 +565,11 @@
}
[(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,smem,flat,flat,
flat,flat,flat,flat")
+ (set_attr "flatmemaccess" "*,*,*,*,*,*,*,*,*,load,load,store,load,load,store")
(set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*,*,*,*")
(set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12,12,12,12")
- (set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")])
+ (set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")
+ (set_attr "laneselect" "*,*,read,*,*,*,*,*,*,*,*,*,*,*,*")])
; 32bit move pattern
@@ -565,38 +577,38 @@
[(set (match_operand:SISF 0 "nonimmediate_operand")
(match_operand:SISF 1 "gcn_load_operand"))]
""
- {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack]
- [SD ,SSA ;sop1 ,* ,4 ,* ,* ] s_mov_b32\t%0, %1
- [SD ,J ;sopk ,* ,4 ,* ,* ] s_movk_i32\t%0, %1
- [SD ,B ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1
- [SD ,RB ;smem ,* ,12,* ,off] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
- [&SD ,RB ;smem ,* ,12,* ,on ] ^
- [RB ,Sm ;smem ,* ,12,* ,* ] s_buffer_store%s1\t%1, s[0:3], %0
- [Sm ,RS ;smem ,* ,12,* ,off] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
- [&Sm ,RS ;smem ,* ,12,* ,on ] ^
- [RS ,Sm ;smem ,* ,12,* ,* ] s_store_dword\t%1, %A0
- [v ,v ;vop1 ,* ,4 ,* ,* ] v_mov_b32\t%0, %1
- [Sg ,v ;vop3a,none,8 ,* ,* ] v_readlane_b32\t%0, %1, 0
- [v ,Sv ;vop3a,none,8 ,* ,* ] v_writelane_b32\t%0, %1, 0
- [v ,^a ;vop3p_mai,*,8,* ,* ] v_accvgpr_read_b32\t%0, %1
- [a ,v ;vop3p_mai,*,8,* ,* ] v_accvgpr_write_b32\t%0, %1
- [a ,a ;vop1 ,* ,4,cdna2,* ] v_accvgpr_mov_b32\t%0, %1
- [v ,RF ;flat ,* ,12,* ,off] flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
- [&v ,RF ;flat ,* ,12,* ,on ] ^
- [^a ,RF ;flat ,* ,12,cdna2,off] ^
- [&^a ,RF ;flat ,* ,12,cdna2,on ] ^
- [RF ,v ;flat ,* ,12,* ,* ] flat_store_dword\t%A0, %1%O0%g0
- [RF ,a ;flat ,* ,12,cdna2,* ] ^
- [v ,B ;vop1 ,* ,8 ,* ,* ] v_mov_b32\t%0, %1
- [RLRG,v ;ds ,* ,12,* ,* ] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RLRG;ds ,* ,12,* ,* ] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [SD ,Y ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1
- [v ,RM ;flat ,* ,12,* ,off] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [&v ,RM ;flat ,* ,12,* ,on ] ^
- [^a ,RM ;flat ,* ,12,cdna2,off] ^
- [&^a ,RM ;flat ,* ,12,cdna2,on ] ^
- [RM ,v ;flat ,* ,12,* ,* ] global_store_dword\t%A0, %1%O0%g0
- [RM ,a ;flat ,* ,12,cdna2,* ] ^
+ {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack, laneselect, flatmemaccess]
+ [SD ,SSA ;sop1 ,* ,4 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [SD ,J ;sopk ,* ,4 ,* ,* ,* ,* ] s_movk_i32\t%0, %1
+ [SD ,B ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [SD ,RB ;smem ,* ,12,* ,off,* ,* ] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
+ [&SD ,RB ;smem ,* ,12,* ,on ,* ,* ] ^
+ [RB ,Sm ;smem ,* ,12,* ,* ,* ,* ] s_buffer_store%s1\t%1, s[0:3], %0
+ [Sm ,RS ;smem ,* ,12,* ,off,* ,* ] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ [&Sm ,RS ;smem ,* ,12,* ,on ,* ,* ] ^
+ [RS ,Sm ;smem ,* ,12,* ,* ,* ,* ] s_store_dword\t%1, %A0
+ [v ,v ;vop1 ,* ,4 ,* ,* ,* ,* ] v_mov_b32\t%0, %1
+ [Sg ,v ;vop3a,none,8 ,* ,* ,read ,* ] v_readlane_b32\t%0, %1, 0
+ [v ,Sv ;vop3a,none,8 ,* ,* ,write,* ] v_writelane_b32\t%0, %1, 0
+ [v ,^a ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_read_b32\t%0, %1
+ [a ,v ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_write_b32\t%0, %1
+ [a ,a ;vop1 ,* ,4,cdna2,* ,* ,* ] v_accvgpr_mov_b32\t%0, %1
+ [v ,RF ;flat ,* ,12,* ,off,* ,load ] flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [&v ,RF ;flat ,* ,12,* ,on ,* ,load ] ^
+ [^a ,RF ;flat ,* ,12,cdna2,off,* ,load ] ^
+ [&^a ,RF ;flat ,* ,12,cdna2,on ,* ,load ] ^
+ [RF ,v ;flat ,* ,12,* ,* ,* ,store] flat_store_dword\t%A0, %1%O0%g0
+ [RF ,a ;flat ,* ,12,cdna2,* ,* ,store] ^
+ [v ,B ;vop1 ,* ,8 ,* ,* ,* ,* ] v_mov_b32\t%0, %1
+ [RLRG,v ;ds ,* ,12,* ,* ,* ,* ] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RLRG;ds ,* ,12,* ,* ,* ,* ] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [SD ,Y ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [v ,RM ;flat ,* ,12,* ,off,* ,load ] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [&v ,RM ;flat ,* ,12,* ,on ,* ,load ] ^
+ [^a ,RM ;flat ,* ,12,cdna2,off,* ,load ] ^
+ [&^a ,RM ;flat ,* ,12,cdna2,on ,* ,load ] ^
+ [RM ,v ;flat ,* ,12,* ,* ,* ,store] global_store_dword\t%A0, %1%O0%g0
+ [RM ,a ;flat ,* ,12,cdna2,* ,* ,store] ^
})
; 8/16bit move pattern
@@ -606,31 +618,31 @@
[(set (match_operand:QIHI 0 "nonimmediate_operand")
(match_operand:QIHI 1 "gcn_load_operand"))]
"gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
- {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack]
- [SD ,SSA ;sop1 ,* ,4 ,* ,* ] s_mov_b32\t%0, %1
- [SD ,J ;sopk ,* ,4 ,* ,* ] s_movk_i32\t%0, %1
- [SD ,B ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1
- [v ,v ;vop1 ,* ,4 ,* ,* ] v_mov_b32\t%0, %1
- [Sg ,v ;vop3a,none,4 ,* ,* ] v_readlane_b32\t%0, %1, 0
- [v ,Sv ;vop3a,none,4 ,* ,* ] v_writelane_b32\t%0, %1, 0
- [v ,^a ;vop3p_mai,*,8,* ,* ] v_accvgpr_read_b32\t%0, %1
- [a ,v ;vop3p_mai,*,8,* ,* ] v_accvgpr_write_b32\t%0, %1
- [a ,a ;vop1 ,* ,8,cdna2,* ] v_accvgpr_mov_b32\t%0, %1
- [v ,RF ;flat ,* ,12,* ,off] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
- [&v ,RF ;flat ,* ,12,* ,on ] ^
- [^a ,RF ;flat ,* ,12,cdna2,off] ^
- [&^a ,RF ;flat ,* ,12,cdna2,on ] ^
- [RF ,v ;flat ,* ,12,* ,* ] flat_store%s0\t%A0, %1%O0%g0
- [RF ,a ;flat ,* ,12,cdna2,* ] ^
- [v ,B ;vop1 ,* ,8 ,* ,* ] v_mov_b32\t%0, %1
- [RLRG,v ;ds ,* ,12,* ,* ] ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RLRG;ds ,* ,12,* ,* ] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [v ,RM ;flat ,* ,12,* ,off] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [&v ,RM ;flat ,* ,12,* ,on ] ^
- [^a ,RM ;flat ,* ,12,cdna2,off] ^
- [&^a ,RM ;flat ,* ,12,cdna2,on ] ^
- [RM ,v ;flat ,* ,12,* ,* ] global_store%s0\t%A0, %1%O0%g0
- [RM ,a ;flat ,* ,12,cdna2,* ] ^
+ {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack, laneselect, flatmemaccess]
+ [SD ,SSA ;sop1 ,* ,4 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [SD ,J ;sopk ,* ,4 ,* ,* ,* ,* ] s_movk_i32\t%0, %1
+ [SD ,B ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [v ,v ;vop1 ,* ,4 ,* ,* ,* ,* ] v_mov_b32\t%0, %1
+ [Sg ,v ;vop3a,none,4 ,* ,* ,read ,* ] v_readlane_b32\t%0, %1, 0
+ [v ,Sv ;vop3a,none,4 ,* ,* ,write,* ] v_writelane_b32\t%0, %1, 0
+ [v ,^a ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_read_b32\t%0, %1
+ [a ,v ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_write_b32\t%0, %1
+ [a ,a ;vop1 ,* ,8,cdna2,* ,* ,* ] v_accvgpr_mov_b32\t%0, %1
+ [v ,RF ;flat ,* ,12,* ,off,* ,load ] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [&v ,RF ;flat ,* ,12,* ,on ,* ,load ] ^
+ [^a ,RF ;flat ,* ,12,cdna2,off,* ,load ] ^
+ [&^a ,RF ;flat ,* ,12,cdna2,on ,* ,load ] ^
+ [RF ,v ;flat ,* ,12,* ,* ,* ,store] flat_store%s0\t%A0, %1%O0%g0
+ [RF ,a ;flat ,* ,12,cdna2,* ,* ,store] ^
+ [v ,B ;vop1 ,* ,8 ,* ,* ,* ,* ] v_mov_b32\t%0, %1
+ [RLRG,v ;ds ,* ,12,* ,* ,* ,* ] ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RLRG;ds ,* ,12,* ,* ,* ,* ] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [v ,RM ;flat ,* ,12,* ,off,* ,load ] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [&v ,RM ;flat ,* ,12,* ,on ,* ,load ] ^
+ [^a ,RM ;flat ,* ,12,cdna2,off,* ,load ] ^
+ [&^a ,RM ;flat ,* ,12,cdna2,on ,* ,load ] ^
+ [RM ,v ;flat ,* ,12,* ,* ,* ,store] global_store%s0\t%A0, %1%O0%g0
+ [RM ,a ;flat ,* ,12,cdna2,* ,* ,store] ^
})
; 64bit move pattern
@@ -639,34 +651,34 @@
[(set (match_operand:DIDF 0 "nonimmediate_operand")
(match_operand:DIDF 1 "general_operand"))]
"GET_CODE(operands[1]) != SYMBOL_REF"
- {@ [cons: =0, 1; attrs: type, length, cdna, xnack]
- [SD ,SSA ;sop1 ,4 ,* ,* ] s_mov_b64\t%0, %1
- [SD ,C ;sop1 ,8 ,* ,* ] ^
- [SD ,DB ;mult ,* ,* ,* ] #
- [RS ,Sm ;smem ,12,* ,* ] s_store_dwordx2\t%1, %A0
- [Sm ,RS ;smem ,12,* ,off] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
- [&Sm ,RS ;smem ,12,* ,on ] ^
- [v ,v ;vmult,* ,* ,* ] #
- [v ,DB ;vmult,* ,* ,* ] #
- [Sg ,v ;vmult,* ,* ,* ] #
- [v ,Sv ;vmult,* ,* ,* ] #
- [v ,^a ;vmult,* ,* ,* ] #
- [a ,v ;vmult,* ,* ,* ] #
- [a ,a ;vmult,* ,cdna2,* ] #
- [v ,RF ;flat ,12,* ,off] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
- [&v ,RF ;flat ,12,* ,on ] ^
- [^a ,RF ;flat ,12,cdna2,off] ^
- [&^a ,RF ;flat ,12,cdna2,on ] ^
- [RF ,v ;flat ,12,* ,* ] flat_store_dwordx2\t%A0, %1%O0%g0
- [RF ,a ;flat ,12,cdna2,* ] ^
- [RLRG,v ;ds ,12,* ,* ] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RLRG;ds ,12,* ,* ] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [v ,RM ;flat ,12,* ,off] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [&v ,RM ;flat ,12,* ,on ] ^
- [^a ,RM ;flat ,12,cdna2,off] ^
- [&^a ,RM ;flat ,12,cdna2,on ] ^
- [RM ,v ;flat ,12,* ,* ] global_store_dwordx2\t%A0, %1%O0%g0
- [RM ,a ;flat ,12,cdna2,* ] ^
+ {@ [cons: =0, 1; attrs: type, length, cdna, xnack, flatmemaccess]
+ [SD ,SSA ;sop1 ,4 ,* ,* ,* ] s_mov_b64\t%0, %1
+ [SD ,C ;sop1 ,8 ,* ,* ,* ] ^
+ [SD ,DB ;mult ,* ,* ,* ,* ] #
+ [RS ,Sm ;smem ,12,* ,* ,* ] s_store_dwordx2\t%1, %A0
+ [Sm ,RS ;smem ,12,* ,off,* ] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ [&Sm ,RS ;smem ,12,* ,on ,* ] ^
+ [v ,v ;vmult,* ,* ,* ,* ] #
+ [v ,DB ;vmult,* ,* ,* ,* ] #
+ [Sg ,v ;vmult,* ,* ,* ,* ] #
+ [v ,Sv ;vmult,* ,* ,* ,* ] #
+ [v ,^a ;vmult,* ,* ,* ,* ] #
+ [a ,v ;vmult,* ,* ,* ,* ] #
+ [a ,a ;vmult,* ,cdna2,* ,* ] #
+ [v ,RF ;flat ,12,* ,off,load ] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [&v ,RF ;flat ,12,* ,on ,load ] ^
+ [^a ,RF ;flat ,12,cdna2,off,load ] ^
+ [&^a ,RF ;flat ,12,cdna2,on ,load ] ^
+ [RF ,v ;flat ,12,* ,* ,store] flat_store_dwordx2\t%A0, %1%O0%g0
+ [RF ,a ;flat ,12,cdna2,* ,store] ^
+ [RLRG,v ;ds ,12,* ,* ,* ] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RLRG;ds ,12,* ,* ,* ] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [v ,RM ;flat ,12,* ,off,load ] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [&v ,RM ;flat ,12,* ,on ,load ] ^
+ [^a ,RM ;flat ,12,cdna2,off,load ] ^
+ [&^a ,RM ;flat ,12,cdna2,on ,load ] ^
+ [RM ,v ;flat ,12,* ,* ,store] global_store_dwordx2\t%A0, %1%O0%g0
+ [RM ,a ;flat ,12,cdna2,* ,store] ^
}
"reload_completed
&& ((!MEM_P (operands[0]) && !MEM_P (operands[1])
@@ -704,31 +716,31 @@
[(set (match_operand:TI 0 "nonimmediate_operand")
(match_operand:TI 1 "general_operand" ))]
""
- {@ [cons: =0, 1; attrs: type, delayeduse, length, cdna, xnack]
- [SD ,SSB;mult ,* ,* ,* ,* ] #
- [RS ,Sm ;smem ,* ,12,* ,* ] s_store_dwordx4\t%1, %A0
- [Sm ,RS ;smem ,yes,12,* ,off] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
- [&Sm,RS ;smem ,yes,12,* ,on ] ^
- [RF ,v ;flat ,* ,12,* ,* ] flat_store_dwordx4\t%A0, %1%O0%g0
- [RF ,a ;flat ,* ,12,cdna2,* ] ^
- [v ,RF ;flat ,* ,12,* ,off] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
- [&v ,RF ;flat ,* ,12,* ,on ] ^
- [^a ,RF ;flat ,* ,12,cdna2,off] ^
- [&^a,RF ;flat ,* ,12,cdna2,on ] ^
- [v ,v ;vmult,* ,* ,* ,* ] #
- [v ,Sv ;vmult,* ,* ,* ,* ] #
- [SD ,v ;vmult,* ,* ,* ,* ] #
- [RM ,v ;flat ,yes,12,* ,* ] global_store_dwordx4\t%A0, %1%O0%g0
- [RM ,a ;flat ,yes,12,cdna2,* ] ^
- [v ,RM ;flat ,* ,12,* ,off] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [&v ,RM ;flat ,* ,12,* ,on ] ^
- [^a ,RM ;flat ,* ,12,cdna2,off] ^
- [&^a,RM ;flat ,* ,12,cdna2,on ] ^
- [RL ,v ;ds ,* ,12,* ,* ] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RL ;ds ,* ,12,* ,* ] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [v ,^a ;vmult,* ,* ,* ,* ] #
- [a ,v ;vmult,* ,* ,* ,* ] #
- [a ,a ;vmult,* ,* ,cdna2,* ] #
+ {@ [cons: =0, 1; attrs: type, length, cdna, xnack, flatmemaccess]
+ [SD ,SSB;mult ,* ,* ,* ,* ] #
+ [RS ,Sm ;smem ,12,* ,* ,* ] s_store_dwordx4\t%1, %A0
+ [Sm ,RS ;smem ,12,* ,off,* ] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ [&Sm,RS ;smem ,12,* ,on ,* ] ^
+ [RF ,v ;flat ,12,* ,* ,storex34] flat_store_dwordx4\t%A0, %1%O0%g0
+ [RF ,a ;flat ,12,cdna2,* ,storex34] ^
+ [v ,RF ;flat ,12,* ,off,load ] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [&v ,RF ;flat ,12,* ,on ,load ] ^
+ [^a ,RF ;flat ,12,cdna2,off,load ] ^
+ [&^a,RF ;flat ,12,cdna2,on ,load ] ^
+ [v ,v ;vmult,* ,* ,* ,* ] #
+ [v ,Sv ;vmult,* ,* ,* ,* ] #
+ [SD ,v ;vmult,* ,* ,* ,* ] #
+ [RM ,v ;flat ,12,* ,* ,storex34] global_store_dwordx4\t%A0, %1%O0%g0
+ [RM ,a ;flat ,12,cdna2,* ,storex34] ^
+ [v ,RM ;flat ,12,* ,off,load ] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [&v ,RM ;flat ,12,* ,on ,load ] ^
+ [^a ,RM ;flat ,12,cdna2,off,load ] ^
+ [&^a,RM ;flat ,12,cdna2,on ,load ] ^
+ [RL ,v ;ds ,12,* ,* ,* ] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RL ;ds ,12,* ,* ,* ] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [v ,^a ;vmult,* ,* ,* ,* ] #
+ [a ,v ;vmult,* ,* ,* ,* ] #
+ [a ,a ;vmult,* ,cdna2,* ,* ] #
}
"reload_completed
&& REG_P (operands[0])
@@ -1985,6 +1997,7 @@
flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 %G2\;s_waitcnt\t0
global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 %G2\;s_waitcnt\tvmcnt(0)"
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,atomicwait,atomicwait")
(set_attr "length" "12")])
; FIXME: These patterns are disabled because the instructions don't
@@ -2006,6 +2019,7 @@
flat_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\t0
global_atomic_<bare_mnemonic><X>\t%A0, %1%O0\;s_waitcnt\tvmcnt(0)"
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,atomicwait,atomicwait")
(set_attr "length" "12")])
(define_mode_attr x2 [(SI "DI") (DI "TI")])
@@ -2053,7 +2067,7 @@
global_atomic_cmpswap<X>\t%0, %A1, %2%O1 %G2\;s_waitcnt\tvmcnt(0)"
[(set_attr "type" "smem,flat,flat")
(set_attr "length" "12")
- (set_attr "delayeduse" "*,yes,yes")])
+ (set_attr "flatmemaccess" "*,cmpswapx2,cmpswapx2")])
(define_insn "sync_compare_and_swap<mode>_lds_insn"
[(set (match_operand:SIDI 0 "register_operand" "= v")
@@ -2173,6 +2187,7 @@
gcc_unreachable ();
}
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,load,load")
(set_attr "length" "28")
(set_attr "rdna" "no,*,*")])
@@ -2257,6 +2272,7 @@
gcc_unreachable ();
}
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,store,store")
(set_attr "length" "28")
(set_attr "rdna" "no,*,*")])
@@ -2389,6 +2405,7 @@
gcc_unreachable ();
}
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,atomicwait,atomicwait")
(set_attr "length" "28")
(set_attr "rdna" "no,*,*")])