diff options
-rw-r--r-- | gcc/config/gcn/gcn-opts.h | 5 | ||||
-rw-r--r-- | gcc/config/gcn/gcn-valu.md | 27 | ||||
-rw-r--r-- | gcc/config/gcn/gcn.cc | 168 | ||||
-rw-r--r-- | gcc/config/gcn/gcn.md | 249 |
4 files changed, 312 insertions, 137 deletions
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h index 0bfc786..fe68678 100644 --- a/gcc/config/gcn/gcn-opts.h +++ b/gcc/config/gcn/gcn-opts.h @@ -82,8 +82,13 @@ enum hsaco_attr_type #define TARGET_DPP_FULL !TARGET_RDNA2_PLUS #define TARGET_DPP16 TARGET_RDNA2_PLUS #define TARGET_DPP8 TARGET_RDNA2_PLUS +/* Device requires no manually inserted wait states; that's the + case for RDNA 2, 3 and 3.5 (but not for RDNA 4). */ +#define TARGET_NO_MANUAL_NOPS TARGET_RDNA2_PLUS /* Device requires CDNA1-style manually inserted wait states for AVGPRs. */ #define TARGET_AVGPR_CDNA1_NOPS TARGET_CDNA1 +/* Device requires CDNA3-style manually inserted wait states. */ +#define TARGET_CDNA3_NOPS TARGET_CDNA3 /* Whether to use the 'globally coherent' (glc) or the 'scope' (sc0) flag for non-scalar memory operations. The string starts on purpose with a space. Note: for scalar memory operations (i.e. 's_...'), 'glc' is still used. diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 3899117..0994329 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -811,7 +811,7 @@ [(set_attr "type" "vop3a") (set_attr "length" "8") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "write")]) ; FIXME: 64bit operations really should be splitters, but I am not sure how ; to represent vertical subregs. 
@@ -828,7 +828,7 @@ [(set_attr "type" "vmult") (set_attr "length" "16") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "write")]) (define_expand "vec_set<mode>" [(set (match_operand:V_MOV 0 "register_operand") @@ -854,7 +854,7 @@ [(set_attr "type" "vop3a") (set_attr "length" "8") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "write")]) (define_insn "*vec_set<mode>_1" [(set (match_operand:V_2REG 0 "register_operand" "=v") @@ -871,7 +871,7 @@ [(set_attr "type" "vmult") (set_attr "length" "16") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "write")]) (define_insn "vec_duplicate<mode><exec>" [(set (match_operand:V_1REG 0 "register_operand" "=v") @@ -910,7 +910,7 @@ [(set_attr "type" "vop3a") (set_attr "length" "8") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "read")]) (define_insn "vec_extract<mode><scalar_mode>" [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=&Sg") @@ -922,7 +922,7 @@ [(set_attr "type" "vmult") (set_attr "length" "16") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "read")]) (define_insn "vec_extract<mode><scalar_mode>" [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=&Sg") @@ -934,7 +934,7 @@ [(set_attr "type" "vmult") (set_attr "length" "32") (set_attr "exec" "none") - (set_attr "laneselect" "yes")]) + (set_attr "laneselect" "read")]) (define_insn "vec_extract<V_1REG:mode><V_1REG_ALT:mode>_nop" [(set (match_operand:V_1REG_ALT 0 "register_operand" "=v,v") @@ -1192,6 +1192,7 @@ return buf; } [(set_attr "type" "flat") + (set_attr "flatmemaccess" "load") (set_attr "length" "12") (set_attr "cdna" "*,cdna2,*,cdna2") (set_attr "xnack" "off,off,on,on")]) @@ -1250,6 +1251,7 @@ return buf; } [(set_attr "type" "flat") + (set_attr "flatmemaccess" "load") (set_attr "length" "12") (set_attr "cdna" "*,cdna2,*,cdna2") (set_attr "xnack" "off,off,on,on")]) @@ 
-1335,6 +1337,7 @@ return buf; } [(set_attr "type" "flat") + (set_attr "flatmemaccess" "store") (set_attr "length" "12") (set_attr "cdna" "*,cdna2")]) @@ -1390,6 +1393,7 @@ return buf; } [(set_attr "type" "flat") + (set_attr "flatmemaccess" "store") (set_attr "length" "12") (set_attr "cdna" "*,cdna2")]) @@ -3260,7 +3264,8 @@ "flag_unsafe_math_optimizations" "v_sqrt%i0\t%0, %1" [(set_attr "type" "vop1") - (set_attr "length" "8")]) + (set_attr "length" "8") + (set_attr "transop" "yes")]) (define_insn "sqrt<mode>2" [(set (match_operand:FP 0 "register_operand" "= v") @@ -3269,7 +3274,8 @@ "flag_unsafe_math_optimizations" "v_sqrt%i0\t%0, %1" [(set_attr "type" "vop1") - (set_attr "length" "8")]) + (set_attr "length" "8") + (set_attr "transop" "yes")]) ; These FP unops have f64, f32 and f16 versions. (define_int_iterator MATH_UNOP_1OR2REG @@ -3559,7 +3565,8 @@ "" "v_rcp%i0\t%0, %1" [(set_attr "type" "vop1") - (set_attr "length" "8")]) + (set_attr "length" "8") + (set_attr "transop" "yes")]) ;; v_div_scale takes a numerator (op2) and denominator (op1) and returns the ;; one that matches op3 adjusted for best results in reciprocal division. diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 6cd17d9..8959118 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -5792,6 +5792,42 @@ gcn_libc_has_function (enum function_class fn_class, /* }}} */ /* {{{ md_reorg pass. */ +/* Identify V_CMPX from the "type" attribute; + note: this will also match 'v_cmp %E1 vcc'. 
*/ + +static bool +gcn_cmpx_insn_p (attr_type type) +{ + switch (type) + { + case TYPE_VOPC: + return true; + case TYPE_MUBUF: + case TYPE_MTBUF: + case TYPE_FLAT: + case TYPE_VOP3P_MAI: + case TYPE_UNKNOWN: + case TYPE_SOP1: + case TYPE_SOP2: + case TYPE_SOPK: + case TYPE_SOPC: + case TYPE_SOPP: + case TYPE_SMEM: + case TYPE_DS: + case TYPE_VOP2: + case TYPE_VOP1: + case TYPE_VOP3A: + case TYPE_VOP3B: + case TYPE_VOP_SDWA: + case TYPE_VOP_DPP: + case TYPE_MULT: + case TYPE_VMULT: + return false; + } + gcc_unreachable (); + return false; +} + /* Identify VMEM instructions from their "type" attribute. */ static bool @@ -6152,12 +6188,22 @@ gcn_md_reorg (void) detects the missed cases, and inserts the documented number of NOPs required for correct execution. */ + /* RDNA4 (not yet implemented) differs from RDNA 2/3/3.5 and requires some + s_nop, see 5.7 and esp. 5.7.2. in its ISA manual. + The assert here is a reminder to add those. */ + STATIC_ASSERT (ISA_CDNA1 - ISA_RDNA3 == 1); + + if (TARGET_NO_MANUAL_NOPS) + return; + const int max_waits = 5; struct ilist { rtx_insn *insn; attr_unit unit; - attr_delayeduse delayeduse; + attr_type type; + attr_flatmemaccess flatmemaccess; + bool delayeduse; HARD_REG_SET writes; HARD_REG_SET reads; int age; @@ -6178,7 +6224,29 @@ gcn_md_reorg (void) attr_type itype = get_attr_type (insn); attr_unit iunit = get_attr_unit (insn); - attr_delayeduse idelayeduse = get_attr_delayeduse (insn); + attr_flatmemaccess iflatmemaccess = get_attr_flatmemaccess (insn); + bool delayeduse; + if (TARGET_CDNA3_NOPS) + switch (iflatmemaccess) + { + case FLATMEMACCESS_STORE: + case FLATMEMACCESS_STOREX34: + case FLATMEMACCESS_ATOMIC: + case FLATMEMACCESS_CMPSWAPX2: + delayeduse = true; + break; + case FLATMEMACCESS_LOAD: + case FLATMEMACCESS_ATOMICWAIT: + case FLATMEMACCESS_NO: + delayeduse = false; + break; + default: + gcc_unreachable (); + } + else + delayeduse = (iflatmemaccess == FLATMEMACCESS_CMPSWAPX2 + || iflatmemaccess == 
FLATMEMACCESS_STOREX34); + int ivccwait = get_attr_vccwait (insn); HARD_REG_SET ireads, iwrites; CLEAR_HARD_REG_SET (ireads); @@ -6223,16 +6291,26 @@ gcn_md_reorg (void) && TEST_HARD_REG_BIT (ireads, VCCZ_REG)))) nops_rqd = 5 - prev_insn->age; - /* VALU writes SGPR/VCC followed by v_{read,write}lane using - SGPR/VCC as lane select requires 4 wait states. */ + /* VALU writes SGPR/VCC followed by + - v_{read,write}lane using SGPR/VCC as lane select requires + 4 wait states + - [CDNA3] VALU reads SGPR as constant requires 1 wait state + - [CDNA3] VALU reads SGPR as carry-in requires no wait states */ if ((prev_insn->age + nops_rqd) < 4 && prev_insn->unit == UNIT_VECTOR - && get_attr_laneselect (insn) == LANESELECT_YES + && get_attr_laneselect (insn) != LANESELECT_NO && (hard_reg_set_intersect_p (depregs, reg_class_contents[(int) SGPR_REGS]) || hard_reg_set_intersect_p (depregs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))) nops_rqd = 4 - prev_insn->age; + else if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 1 + && prev_insn->unit == UNIT_VECTOR + && iunit == UNIT_VECTOR + && hard_reg_set_intersect_p + (depregs, reg_class_contents[(int) SGPR_REGS])) + nops_rqd = 1 - prev_insn->age; /* VALU writes VGPR followed by VALU_DPP reading that VGPR requires 2 wait states. */ @@ -6245,22 +6323,88 @@ gcn_md_reorg (void) nops_rqd = 2 - prev_insn->age; } + /* VALU writes EXEC followed by VALU DPP op requires 5 nop. */ + if ((prev_insn->age + nops_rqd) < 5 + && itype == TYPE_VOP_DPP + && prev_insn->unit == UNIT_VECTOR + && TEST_HARD_REG_BIT (prev_insn->writes, EXECZ_REG)) + nops_rqd = 5 - prev_insn->age; + /* Store that requires input registers are not overwritten by - following instruction. */ - if ((prev_insn->age + nops_rqd) < 1 - && prev_insn->delayeduse == DELAYEDUSE_YES + following instruction. + For CDNA3, only, VALU writes require 2 not 1 nop. + CDNA3 additionally requires that 1 or 2 nop for global & scatch + store/atomic. 
*/ + if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 2 + && prev_insn->delayeduse + && iunit == UNIT_VECTOR + && ((hard_reg_set_intersect_p + (prev_insn->reads, iwrites)))) + nops_rqd = 2 - prev_insn->age; + else if ((prev_insn->age + nops_rqd) < 1 + && prev_insn->delayeduse && ((hard_reg_set_intersect_p (prev_insn->reads, iwrites)))) nops_rqd = 1 - prev_insn->age; - /* Instruction that requires VCC is not written too close before - using it. */ + /* Instruction (such as v_div_fmas) that requires VCC is not written + too close before using it. */ if (prev_insn->age < ivccwait && (hard_reg_set_intersect_p (prev_insn->writes, reg_class_contents[(int)VCC_CONDITIONAL_REG]))) nops_rqd = ivccwait - prev_insn->age; + /* CDNA3: v_cmpx followed by + - V_readlane, v_readfirstlane, v_writelane requires 4 wait states + - VALU reads EXEC as constant requires 2 wait states + - other VALU requires no wait state */ + if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 4 + && gcn_cmpx_insn_p (prev_insn->type) + && get_attr_laneselect (insn) != LANESELECT_NO) + nops_rqd = 4 - prev_insn->age; + else if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 2 + && iunit == UNIT_VECTOR + && gcn_cmpx_insn_p (prev_insn->type) + && TEST_HARD_REG_BIT (ireads, EXECZ_REG)) + nops_rqd = 2 - prev_insn->age; + + /* CDNA3: VALU writes VGPR followed by v_readlane vsrc0 reads VGPRn + requires 1 wait state. */ + if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 1 + && prev_insn->unit == UNIT_VECTOR + && prev_insn->flatmemaccess != FLATMEMACCESS_LOAD + && get_attr_laneselect (insn) == LANESELECT_READ + && hard_reg_set_intersect_p + (depregs, reg_class_contents[(int) VGPR_REGS])) + nops_rqd = 1 - prev_insn->age; + + /* CDNA3: VALU op which uses OPSEL or SDWA which changes the result's + bit position followed by VALU op consumes result of that op + requires 1 wait state. + FIXME: Handle OPSEL, once used. 
*/ + if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 1 + && prev_insn->unit == UNIT_VECTOR + && prev_insn->type == TYPE_VOP_SDWA + && !hard_reg_set_empty_p (depregs)) + nops_rqd = 1 - prev_insn->age; + + /* CDNA3: VALU Trans Op (such as v_rcp_f64) followed by non-trans VALU + op consumes result of that op requires 1 wait state. */ + if (TARGET_CDNA3_NOPS + && (prev_insn->age + nops_rqd) < 1 + && prev_insn->unit == UNIT_VECTOR + && iunit == UNIT_VECTOR + && get_attr_transop (prev_insn->insn) == TRANSOP_YES + && get_attr_transop (insn) == TRANSOP_NO + && !hard_reg_set_empty_p (depregs)) + nops_rqd = 1 - prev_insn->age; + /* CDNA1: write VGPR before v_accvgpr_write reads it. */ if (TARGET_AVGPR_CDNA1_NOPS && (prev_insn->age + nops_rqd) < 2 @@ -6316,7 +6460,9 @@ gcn_md_reorg (void) /* Track the current instruction as a previous instruction. */ back[oldest].insn = insn; back[oldest].unit = iunit; - back[oldest].delayeduse = idelayeduse; + back[oldest].type = itype; + back[oldest].flatmemaccess = iflatmemaccess; + back[oldest].delayeduse = delayeduse; back[oldest].writes = iwrites; back[oldest].reads = ireads; back[oldest].age = 0; diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index 9193461..fad42e6 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -312,18 +312,28 @@ ; We need to be able to identify v_readlane and v_writelane with ; SGPR lane selection in order to handle "Manually Inserted Wait States". -(define_attr "laneselect" "yes,no" (const_string "no")) +(define_attr "laneselect" "write,read,no" (const_string "no")) -; Identify instructions that require a "Manually Inserted Wait State" if -; their inputs are overwritten by subsequent instructions. +; Global or flat memory access using store or load followed by waitcnt +; and using flat/global atomic access, possibly followed by a waitcnt. +; 'storex34' denotes FLAT_STORE_X{3,4}. +; 'cmpswapx2' denotes FLAT_ATOMIC_{F}CMPSWAP_X2 +; Used to handle "Manually Inserted Wait State". 
-(define_attr "delayeduse" "yes,no" (const_string "no")) +(define_attr "flatmemaccess" + "store,storex34,load,atomic,atomicwait,cmpswapx2,no" + (const_string "no")) ; Identify instructions that require "Manually Inserted Wait State" if ; a previous instruction writes to VCC. The number gives the number of NOPs. (define_attr "vccwait" "" (const_int 0)) +; Mark trans ops such as v_{exp,rsq,sqrt,sin,cos,log,...}_F{16,32,64} +; for later conditional s_nop insertion. + +(define_attr "transop" "yes,no" (const_string "no")) + ;; }}} ;; {{{ Iterators useful across the wole machine description @@ -555,9 +565,11 @@ } [(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,smem,flat,flat, flat,flat,flat,flat") + (set_attr "flatmemaccess" "*,*,*,*,*,*,*,*,*,load,load,store,load,load,store") (set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*,*,*,*") (set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12,12,12,12") - (set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")]) + (set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*") + (set_attr "laneselect" "*,*,read,*,*,*,*,*,*,*,*,*,*,*,*")]) ; 32bit move pattern @@ -565,38 +577,38 @@ [(set (match_operand:SISF 0 "nonimmediate_operand") (match_operand:SISF 1 "gcn_load_operand"))] "" - {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack] - [SD ,SSA ;sop1 ,* ,4 ,* ,* ] s_mov_b32\t%0, %1 - [SD ,J ;sopk ,* ,4 ,* ,* ] s_movk_i32\t%0, %1 - [SD ,B ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1 - [SD ,RB ;smem ,* ,12,* ,off] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0) - [&SD ,RB ;smem ,* ,12,* ,on ] ^ - [RB ,Sm ;smem ,* ,12,* ,* ] s_buffer_store%s1\t%1, s[0:3], %0 - [Sm ,RS ;smem ,* ,12,* ,off] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0) - [&Sm ,RS ;smem ,* ,12,* ,on ] ^ - [RS ,Sm ;smem ,* ,12,* ,* ] s_store_dword\t%1, %A0 - [v ,v ;vop1 ,* ,4 ,* ,* ] v_mov_b32\t%0, %1 - [Sg ,v ;vop3a,none,8 ,* ,* ] v_readlane_b32\t%0, %1, 0 - [v ,Sv ;vop3a,none,8 ,* ,* ] v_writelane_b32\t%0, %1, 0 - [v ,^a ;vop3p_mai,*,8,* ,* ] 
v_accvgpr_read_b32\t%0, %1 - [a ,v ;vop3p_mai,*,8,* ,* ] v_accvgpr_write_b32\t%0, %1 - [a ,a ;vop1 ,* ,4,cdna2,* ] v_accvgpr_mov_b32\t%0, %1 - [v ,RF ;flat ,* ,12,* ,off] flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0 - [&v ,RF ;flat ,* ,12,* ,on ] ^ - [^a ,RF ;flat ,* ,12,cdna2,off] ^ - [&^a ,RF ;flat ,* ,12,cdna2,on ] ^ - [RF ,v ;flat ,* ,12,* ,* ] flat_store_dword\t%A0, %1%O0%g0 - [RF ,a ;flat ,* ,12,cdna2,* ] ^ - [v ,B ;vop1 ,* ,8 ,* ,* ] v_mov_b32\t%0, %1 - [RLRG,v ;ds ,* ,12,* ,* ] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) - [v ,RLRG;ds ,* ,12,* ,* ] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) - [SD ,Y ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1 - [v ,RM ;flat ,* ,12,* ,off] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - [&v ,RM ;flat ,* ,12,* ,on ] ^ - [^a ,RM ;flat ,* ,12,cdna2,off] ^ - [&^a ,RM ;flat ,* ,12,cdna2,on ] ^ - [RM ,v ;flat ,* ,12,* ,* ] global_store_dword\t%A0, %1%O0%g0 - [RM ,a ;flat ,* ,12,cdna2,* ] ^ + {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack, laneselect, flatmemaccess] + [SD ,SSA ;sop1 ,* ,4 ,* ,* ,* ,* ] s_mov_b32\t%0, %1 + [SD ,J ;sopk ,* ,4 ,* ,* ,* ,* ] s_movk_i32\t%0, %1 + [SD ,B ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1 + [SD ,RB ;smem ,* ,12,* ,off,* ,* ] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0) + [&SD ,RB ;smem ,* ,12,* ,on ,* ,* ] ^ + [RB ,Sm ;smem ,* ,12,* ,* ,* ,* ] s_buffer_store%s1\t%1, s[0:3], %0 + [Sm ,RS ;smem ,* ,12,* ,off,* ,* ] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0) + [&Sm ,RS ;smem ,* ,12,* ,on ,* ,* ] ^ + [RS ,Sm ;smem ,* ,12,* ,* ,* ,* ] s_store_dword\t%1, %A0 + [v ,v ;vop1 ,* ,4 ,* ,* ,* ,* ] v_mov_b32\t%0, %1 + [Sg ,v ;vop3a,none,8 ,* ,* ,read ,* ] v_readlane_b32\t%0, %1, 0 + [v ,Sv ;vop3a,none,8 ,* ,* ,write,* ] v_writelane_b32\t%0, %1, 0 + [v ,^a ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_read_b32\t%0, %1 + [a ,v ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_write_b32\t%0, %1 + [a ,a ;vop1 ,* ,4,cdna2,* ,* ,* ] v_accvgpr_mov_b32\t%0, %1 + [v ,RF ;flat ,* ,12,* ,off,* ,load ] 
flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0 + [&v ,RF ;flat ,* ,12,* ,on ,* ,load ] ^ + [^a ,RF ;flat ,* ,12,cdna2,off,* ,load ] ^ + [&^a ,RF ;flat ,* ,12,cdna2,on ,* ,load ] ^ + [RF ,v ;flat ,* ,12,* ,* ,* ,store] flat_store_dword\t%A0, %1%O0%g0 + [RF ,a ;flat ,* ,12,cdna2,* ,* ,store] ^ + [v ,B ;vop1 ,* ,8 ,* ,* ,* ,* ] v_mov_b32\t%0, %1 + [RLRG,v ;ds ,* ,12,* ,* ,* ,* ] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) + [v ,RLRG;ds ,* ,12,* ,* ,* ,* ] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) + [SD ,Y ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1 + [v ,RM ;flat ,* ,12,* ,off,* ,load ] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) + [&v ,RM ;flat ,* ,12,* ,on ,* ,load ] ^ + [^a ,RM ;flat ,* ,12,cdna2,off,* ,load ] ^ + [&^a ,RM ;flat ,* ,12,cdna2,on ,* ,load ] ^ + [RM ,v ;flat ,* ,12,* ,* ,* ,store] global_store_dword\t%A0, %1%O0%g0 + [RM ,a ;flat ,* ,12,cdna2,* ,* ,store] ^ }) ; 8/16bit move pattern @@ -606,31 +618,31 @@ [(set (match_operand:QIHI 0 "nonimmediate_operand") (match_operand:QIHI 1 "gcn_load_operand"))] "gcn_valid_move_p (<MODE>mode, operands[0], operands[1])" - {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack] - [SD ,SSA ;sop1 ,* ,4 ,* ,* ] s_mov_b32\t%0, %1 - [SD ,J ;sopk ,* ,4 ,* ,* ] s_movk_i32\t%0, %1 - [SD ,B ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1 - [v ,v ;vop1 ,* ,4 ,* ,* ] v_mov_b32\t%0, %1 - [Sg ,v ;vop3a,none,4 ,* ,* ] v_readlane_b32\t%0, %1, 0 - [v ,Sv ;vop3a,none,4 ,* ,* ] v_writelane_b32\t%0, %1, 0 - [v ,^a ;vop3p_mai,*,8,* ,* ] v_accvgpr_read_b32\t%0, %1 - [a ,v ;vop3p_mai,*,8,* ,* ] v_accvgpr_write_b32\t%0, %1 - [a ,a ;vop1 ,* ,8,cdna2,* ] v_accvgpr_mov_b32\t%0, %1 - [v ,RF ;flat ,* ,12,* ,off] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0 - [&v ,RF ;flat ,* ,12,* ,on ] ^ - [^a ,RF ;flat ,* ,12,cdna2,off] ^ - [&^a ,RF ;flat ,* ,12,cdna2,on ] ^ - [RF ,v ;flat ,* ,12,* ,* ] flat_store%s0\t%A0, %1%O0%g0 - [RF ,a ;flat ,* ,12,cdna2,* ] ^ - [v ,B ;vop1 ,* ,8 ,* ,* ] v_mov_b32\t%0, %1 - [RLRG,v ;ds ,* ,12,* ,* ] ds_write%b0\t%A0, 
%1%O0\;s_waitcnt\tlgkmcnt(0) - [v ,RLRG;ds ,* ,12,* ,* ] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) - [v ,RM ;flat ,* ,12,* ,off] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - [&v ,RM ;flat ,* ,12,* ,on ] ^ - [^a ,RM ;flat ,* ,12,cdna2,off] ^ - [&^a ,RM ;flat ,* ,12,cdna2,on ] ^ - [RM ,v ;flat ,* ,12,* ,* ] global_store%s0\t%A0, %1%O0%g0 - [RM ,a ;flat ,* ,12,cdna2,* ] ^ + {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack, laneselect, flatmemaccess] + [SD ,SSA ;sop1 ,* ,4 ,* ,* ,* ,* ] s_mov_b32\t%0, %1 + [SD ,J ;sopk ,* ,4 ,* ,* ,* ,* ] s_movk_i32\t%0, %1 + [SD ,B ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1 + [v ,v ;vop1 ,* ,4 ,* ,* ,* ,* ] v_mov_b32\t%0, %1 + [Sg ,v ;vop3a,none,4 ,* ,* ,read ,* ] v_readlane_b32\t%0, %1, 0 + [v ,Sv ;vop3a,none,4 ,* ,* ,write,* ] v_writelane_b32\t%0, %1, 0 + [v ,^a ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_read_b32\t%0, %1 + [a ,v ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_write_b32\t%0, %1 + [a ,a ;vop1 ,* ,8,cdna2,* ,* ,* ] v_accvgpr_mov_b32\t%0, %1 + [v ,RF ;flat ,* ,12,* ,off,* ,load ] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0 + [&v ,RF ;flat ,* ,12,* ,on ,* ,load ] ^ + [^a ,RF ;flat ,* ,12,cdna2,off,* ,load ] ^ + [&^a ,RF ;flat ,* ,12,cdna2,on ,* ,load ] ^ + [RF ,v ;flat ,* ,12,* ,* ,* ,store] flat_store%s0\t%A0, %1%O0%g0 + [RF ,a ;flat ,* ,12,cdna2,* ,* ,store] ^ + [v ,B ;vop1 ,* ,8 ,* ,* ,* ,* ] v_mov_b32\t%0, %1 + [RLRG,v ;ds ,* ,12,* ,* ,* ,* ] ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) + [v ,RLRG;ds ,* ,12,* ,* ,* ,* ] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) + [v ,RM ;flat ,* ,12,* ,off,* ,load ] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) + [&v ,RM ;flat ,* ,12,* ,on ,* ,load ] ^ + [^a ,RM ;flat ,* ,12,cdna2,off,* ,load ] ^ + [&^a ,RM ;flat ,* ,12,cdna2,on ,* ,load ] ^ + [RM ,v ;flat ,* ,12,* ,* ,* ,store] global_store%s0\t%A0, %1%O0%g0 + [RM ,a ;flat ,* ,12,cdna2,* ,* ,store] ^ }) ; 64bit move pattern @@ -639,34 +651,34 @@ [(set (match_operand:DIDF 0 "nonimmediate_operand") (match_operand:DIDF 1 
"general_operand"))] "GET_CODE(operands[1]) != SYMBOL_REF" - {@ [cons: =0, 1; attrs: type, length, cdna, xnack] - [SD ,SSA ;sop1 ,4 ,* ,* ] s_mov_b64\t%0, %1 - [SD ,C ;sop1 ,8 ,* ,* ] ^ - [SD ,DB ;mult ,* ,* ,* ] # - [RS ,Sm ;smem ,12,* ,* ] s_store_dwordx2\t%1, %A0 - [Sm ,RS ;smem ,12,* ,off] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0) - [&Sm ,RS ;smem ,12,* ,on ] ^ - [v ,v ;vmult,* ,* ,* ] # - [v ,DB ;vmult,* ,* ,* ] # - [Sg ,v ;vmult,* ,* ,* ] # - [v ,Sv ;vmult,* ,* ,* ] # - [v ,^a ;vmult,* ,* ,* ] # - [a ,v ;vmult,* ,* ,* ] # - [a ,a ;vmult,* ,cdna2,* ] # - [v ,RF ;flat ,12,* ,off] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0 - [&v ,RF ;flat ,12,* ,on ] ^ - [^a ,RF ;flat ,12,cdna2,off] ^ - [&^a ,RF ;flat ,12,cdna2,on ] ^ - [RF ,v ;flat ,12,* ,* ] flat_store_dwordx2\t%A0, %1%O0%g0 - [RF ,a ;flat ,12,cdna2,* ] ^ - [RLRG,v ;ds ,12,* ,* ] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) - [v ,RLRG;ds ,12,* ,* ] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) - [v ,RM ;flat ,12,* ,off] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - [&v ,RM ;flat ,12,* ,on ] ^ - [^a ,RM ;flat ,12,cdna2,off] ^ - [&^a ,RM ;flat ,12,cdna2,on ] ^ - [RM ,v ;flat ,12,* ,* ] global_store_dwordx2\t%A0, %1%O0%g0 - [RM ,a ;flat ,12,cdna2,* ] ^ + {@ [cons: =0, 1; attrs: type, length, cdna, xnack, flatmemaccess] + [SD ,SSA ;sop1 ,4 ,* ,* ,* ] s_mov_b64\t%0, %1 + [SD ,C ;sop1 ,8 ,* ,* ,* ] ^ + [SD ,DB ;mult ,* ,* ,* ,* ] # + [RS ,Sm ;smem ,12,* ,* ,* ] s_store_dwordx2\t%1, %A0 + [Sm ,RS ;smem ,12,* ,off,* ] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0) + [&Sm ,RS ;smem ,12,* ,on ,* ] ^ + [v ,v ;vmult,* ,* ,* ,* ] # + [v ,DB ;vmult,* ,* ,* ,* ] # + [Sg ,v ;vmult,* ,* ,* ,* ] # + [v ,Sv ;vmult,* ,* ,* ,* ] # + [v ,^a ;vmult,* ,* ,* ,* ] # + [a ,v ;vmult,* ,* ,* ,* ] # + [a ,a ;vmult,* ,cdna2,* ,* ] # + [v ,RF ;flat ,12,* ,off,load ] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0 + [&v ,RF ;flat ,12,* ,on ,load ] ^ + [^a ,RF ;flat ,12,cdna2,off,load ] ^ + [&^a ,RF ;flat ,12,cdna2,on 
,load ] ^ + [RF ,v ;flat ,12,* ,* ,store] flat_store_dwordx2\t%A0, %1%O0%g0 + [RF ,a ;flat ,12,cdna2,* ,store] ^ + [RLRG,v ;ds ,12,* ,* ,* ] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) + [v ,RLRG;ds ,12,* ,* ,* ] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) + [v ,RM ;flat ,12,* ,off,load ] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) + [&v ,RM ;flat ,12,* ,on ,load ] ^ + [^a ,RM ;flat ,12,cdna2,off,load ] ^ + [&^a ,RM ;flat ,12,cdna2,on ,load ] ^ + [RM ,v ;flat ,12,* ,* ,store] global_store_dwordx2\t%A0, %1%O0%g0 + [RM ,a ;flat ,12,cdna2,* ,store] ^ } "reload_completed && ((!MEM_P (operands[0]) && !MEM_P (operands[1]) @@ -704,31 +716,31 @@ [(set (match_operand:TI 0 "nonimmediate_operand") (match_operand:TI 1 "general_operand" ))] "" - {@ [cons: =0, 1; attrs: type, delayeduse, length, cdna, xnack] - [SD ,SSB;mult ,* ,* ,* ,* ] # - [RS ,Sm ;smem ,* ,12,* ,* ] s_store_dwordx4\t%1, %A0 - [Sm ,RS ;smem ,yes,12,* ,off] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0) - [&Sm,RS ;smem ,yes,12,* ,on ] ^ - [RF ,v ;flat ,* ,12,* ,* ] flat_store_dwordx4\t%A0, %1%O0%g0 - [RF ,a ;flat ,* ,12,cdna2,* ] ^ - [v ,RF ;flat ,* ,12,* ,off] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0 - [&v ,RF ;flat ,* ,12,* ,on ] ^ - [^a ,RF ;flat ,* ,12,cdna2,off] ^ - [&^a,RF ;flat ,* ,12,cdna2,on ] ^ - [v ,v ;vmult,* ,* ,* ,* ] # - [v ,Sv ;vmult,* ,* ,* ,* ] # - [SD ,v ;vmult,* ,* ,* ,* ] # - [RM ,v ;flat ,yes,12,* ,* ] global_store_dwordx4\t%A0, %1%O0%g0 - [RM ,a ;flat ,yes,12,cdna2,* ] ^ - [v ,RM ;flat ,* ,12,* ,off] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - [&v ,RM ;flat ,* ,12,* ,on ] ^ - [^a ,RM ;flat ,* ,12,cdna2,off] ^ - [&^a,RM ;flat ,* ,12,cdna2,on ] ^ - [RL ,v ;ds ,* ,12,* ,* ] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) - [v ,RL ;ds ,* ,12,* ,* ] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) - [v ,^a ;vmult,* ,* ,* ,* ] # - [a ,v ;vmult,* ,* ,* ,* ] # - [a ,a ;vmult,* ,* ,cdna2,* ] # + {@ [cons: =0, 1; attrs: type, length, cdna, xnack, 
flatmemaccess] + [SD ,SSB;mult ,* ,* ,* ,* ] # + [RS ,Sm ;smem ,12,* ,* ,* ] s_store_dwordx4\t%1, %A0 + [Sm ,RS ;smem ,12,* ,off,* ] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0) + [&Sm,RS ;smem ,12,* ,on ,* ] ^ + [RF ,v ;flat ,12,* ,* ,storex34] flat_store_dwordx4\t%A0, %1%O0%g0 + [RF ,a ;flat ,12,cdna2,* ,storex34] ^ + [v ,RF ;flat ,12,* ,off,load ] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0 + [&v ,RF ;flat ,12,* ,on ,load ] ^ + [^a ,RF ;flat ,12,cdna2,off,load ] ^ + [&^a,RF ;flat ,12,cdna2,on ,load ] ^ + [v ,v ;vmult,* ,* ,* ,* ] # + [v ,Sv ;vmult,* ,* ,* ,* ] # + [SD ,v ;vmult,* ,* ,* ,* ] # + [RM ,v ;flat ,12,* ,* ,storex34] global_store_dwordx4\t%A0, %1%O0%g0 + [RM ,a ;flat ,12,cdna2,* ,storex34] ^ + [v ,RM ;flat ,12,* ,off,load ] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) + [&v ,RM ;flat ,12,* ,on ,load ] ^ + [^a ,RM ;flat ,12,cdna2,off,load ] ^ + [&^a,RM ;flat ,12,cdna2,on ,load ] ^ + [RL ,v ;ds ,12,* ,* ,* ] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) + [v ,RL ;ds ,12,* ,* ,* ] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) + [v ,^a ;vmult,* ,* ,* ,* ] # + [a ,v ;vmult,* ,* ,* ,* ] # + [a ,a ;vmult,* ,cdna2,* ,* ] # } "reload_completed && REG_P (operands[0]) @@ -1985,6 +1997,7 @@ flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 %G2\;s_waitcnt\t0 global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 %G2\;s_waitcnt\tvmcnt(0)" [(set_attr "type" "smem,flat,flat") + (set_attr "flatmemaccess" "*,atomicwait,atomicwait") (set_attr "length" "12")]) ; FIXME: These patterns are disabled because the instructions don't @@ -2006,6 +2019,7 @@ flat_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\t0 global_atomic_<bare_mnemonic><X>\t%A0, %1%O0\;s_waitcnt\tvmcnt(0)" [(set_attr "type" "smem,flat,flat") + (set_attr "flatmemaccess" "*,atomicwait,atomicwait") (set_attr "length" "12")]) (define_mode_attr x2 [(SI "DI") (DI "TI")]) @@ -2053,7 +2067,7 @@ global_atomic_cmpswap<X>\t%0, %A1, %2%O1 %G2\;s_waitcnt\tvmcnt(0)" [(set_attr "type" "smem,flat,flat") (set_attr 
"length" "12") - (set_attr "delayeduse" "*,yes,yes")]) + (set_attr "flatmemaccess" "*,cmpswapx2,cmpswapx2")]) (define_insn "sync_compare_and_swap<mode>_lds_insn" [(set (match_operand:SIDI 0 "register_operand" "= v") @@ -2173,6 +2187,7 @@ gcc_unreachable (); } [(set_attr "type" "smem,flat,flat") + (set_attr "flatmemaccess" "*,load,load") (set_attr "length" "28") (set_attr "rdna" "no,*,*")]) @@ -2257,6 +2272,7 @@ gcc_unreachable (); } [(set_attr "type" "smem,flat,flat") + (set_attr "flatmemaccess" "*,store,store") (set_attr "length" "28") (set_attr "rdna" "no,*,*")]) @@ -2389,6 +2405,7 @@ gcc_unreachable (); } [(set_attr "type" "smem,flat,flat") + (set_attr "flatmemaccess" "*,atomicwait,atomicwait") (set_attr "length" "28") (set_attr "rdna" "no,*,*")]) |