-rw-r--r--  gcc/ChangeLog.omp            |  29
-rw-r--r--  gcc/DATESTAMP.omp            |   2
-rw-r--r--  gcc/config/gcn/gcn-valu.md   | 313
-rw-r--r--  gcc/config/gcn/gcn.cc        | 124
-rw-r--r--  gcc/config/gcn/gcn.md        |  28
-rw-r--r--  gcc/doc/tm.texi              |   9
-rw-r--r--  gcc/doc/tm.texi.in           |   2
-rw-r--r--  gcc/hooks.cc                 |   7
-rw-r--r--  gcc/hooks.h                  |   1
-rw-r--r--  gcc/optc-save-gen.awk        |  19
-rw-r--r--  gcc/params.opt               |   6
-rw-r--r--  gcc/target.def               |  14
-rw-r--r--  gcc/testsuite/ChangeLog.omp  |  16
-rw-r--r--  gcc/tree-vect-stmts.cc       |  41
-rw-r--r--  libgomp/ChangeLog.omp        |  32
15 files changed, 535 insertions, 108 deletions
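
Two vectorization-related pieces dominate this commit and are worth calling out before the patch body. First, a new target hook, TARGET_VECTORIZE_PREFER_GATHER_SCATTER, lets a backend tell the vectorizer that gather/scatter accesses are cheaper than elementwise loads and stores; amdgcn is the first user and simply returns true. As a rough orientation sketch (illustrative only -- "mytarget" is a hypothetical port; the real definitions live in the target.def, hooks.cc, and gcn.cc hunks below), wiring the hook up in a backend would look like this:

    /* Sketch for a hypothetical port, not part of this patch.  The default
       implementation, hook_bool_mode_int_unsigned_false, keeps the old
       behaviour of preferring elementwise accesses.  */

    static bool
    mytarget_prefer_gather_scatter (machine_mode ARG_UNUSED (mode),
                                    int ARG_UNUSED (scale),
                                    unsigned int ARG_UNUSED (group_size))
    {
      /* Like amdgcn below: with native gather/scatter addressing, one
         vector access beats a series of scalar loads or stores.  */
      return true;
    }

    #undef TARGET_VECTORIZE_PREFER_GATHER_SCATTER
    #define TARGET_VECTORIZE_PREFER_GATHER_SCATTER \
      mytarget_prefer_gather_scatter

Second, the gcn-valu.md changes add gather/scatter expanders taking 64-bit (VnDI) offset vectors and drop the VCC clobber from the single-register vector add/sub patterns by switching to the carry-less v_add_u32/v_add_nc_u32 family, freeing VCC for the DImode patterns that really need it.
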
diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp
index 69162ef..4b23eb1 100644
--- a/gcc/ChangeLog.omp
+++ b/gcc/ChangeLog.omp
@@ -1,3 +1,32 @@
+2025-07-21  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    Backported from master:
+    2025-05-12  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    * config/nvptx/nvptx-sm.def: Add '61'.
+    * config/nvptx/nvptx-gen.h: Regenerate.
+    * config/nvptx/nvptx-gen.opt: Likewise.
+    * config/nvptx/nvptx.cc (first_ptx_version_supporting_sm): Adjust.
+    * config/nvptx/nvptx.opt (-march-map=sm_61, -march-map=sm_62):
+    Likewise.
+    * config.gcc: Likewise.
+    * doc/invoke.texi (Nvidia PTX Options): Document '-march=sm_61'.
+    * config/nvptx/gen-multilib-matches-tests: Extend.
+
+2025-07-21  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    Backported from master:
+    2025-05-12  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    * config/nvptx/nvptx-opts.h (enum ptx_version): Add
+    'PTX_VERSION_5_0'.
+    * config/nvptx/nvptx.cc (ptx_version_to_string)
+    (ptx_version_to_number): Adjust.
+    * config/nvptx/nvptx.h (TARGET_PTX_5_0): New.
+    * config/nvptx/nvptx.opt (Enum(ptx_version)): Add 'EnumValue'
+    '5.0' for 'PTX_VERSION_5_0'.
+    * doc/invoke.texi (Nvidia PTX Options): Document '-mptx=5.0'.
+
 2025-06-17  Tobias Burnus  <tburnus@baylibre.com>
 
     Backported from master:
diff --git a/gcc/DATESTAMP.omp b/gcc/DATESTAMP.omp
index 6952979..7578d89 100644
--- a/gcc/DATESTAMP.omp
+++ b/gcc/DATESTAMP.omp
@@ -1 +1 @@
-20250703
+20250722
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 4b21302..3899117 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -1133,6 +1133,23 @@
     DONE;
   })
 
+(define_expand "gather_load<mode><vndi>"
+  [(match_operand:V_MOV 0 "register_operand")
+   (match_operand:DI 1 "register_operand")
+   (match_operand:<VnDI> 2 "register_operand")
+   (match_operand 3 "immediate_operand")
+   (match_operand:SI 4 "gcn_alu_operand")]
+  ""
+  {
+    rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[1],
+                                          operands[2], operands[4],
+                                          INTVAL (operands[3]), NULL);
+
+    emit_insn (gen_gather<mode>_insn_1offset (operands[0], addr, const0_rtx,
+                                              const0_rtx, const0_rtx));
+    DONE;
+  })
+
 ; Allow any address expression
 (define_expand "gather<mode>_expr<exec>"
   [(set (match_operand:V_MOV 0 "register_operand")
@@ -1259,6 +1276,23 @@
     DONE;
   })
 
+(define_expand "scatter_store<mode><vndi>"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand:<VnDI> 1 "register_operand")
+   (match_operand 2 "immediate_operand")
+   (match_operand:SI 3 "gcn_alu_operand")
+   (match_operand:V_MOV 4 "register_operand")]
+  ""
+  {
+    rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[0],
+                                          operands[1], operands[3],
+                                          INTVAL (operands[2]), NULL);
+
+    emit_insn (gen_scatter<mode>_insn_1offset (addr, const0_rtx, operands[4],
+                                               const0_rtx, const0_rtx));
+    DONE;
+  })
+
 ; Allow any address expression
 (define_expand "scatter<mode>_expr<exec_scatter>"
   [(set (mem:BLK (scratch))
@@ -1455,28 +1489,26 @@
 ;; }}}
 ;; {{{ ALU special case: add/sub
 
-(define_insn "add<mode>3<exec_clobber>"
+(define_insn "add<mode>3<exec>"
   [(set (match_operand:V_INT_1REG 0 "register_operand")
         (plus:V_INT_1REG
           (match_operand:V_INT_1REG 1 "register_operand")
-          (match_operand:V_INT_1REG 2 "gcn_alu_operand")))
-   (clobber (reg:DI VCC_REG))]
+          (match_operand:V_INT_1REG 2 "gcn_alu_operand")))]
   ""
   {@ [cons: =0, %1, 2; attrs: type, length]
-  [v,v,vSvA;vop2,4] v_add_co_u32\t%0, vcc, %2, %1
+  [v,v,vSvA;vop2,4] {v_add_u32|v_add_nc_u32}\t%0, %2, %1
   [v,v,vSvB;vop2,8] ^
   })
 
-(define_insn "add<mode>3_dup<exec_clobber>"
+(define_insn "add<mode>3_dup<exec>"
   [(set (match_operand:V_INT_1REG 0 "register_operand")
         (plus:V_INT_1REG
           (vec_duplicate:V_INT_1REG
             (match_operand:<SCALAR_MODE> 2 "gcn_alu_operand"))
-          (match_operand:V_INT_1REG 1 "register_operand")))
-   (clobber (reg:DI VCC_REG))]
+          (match_operand:V_INT_1REG 1 "register_operand")))]
   ""
   {@ [cons: =0, 1, 2; attrs: type, length]
-  [v,v,SvA;vop2,4] v_add_co_u32\t%0, vcc, %2, %1
+  [v,v,SvA;vop2,4] {v_add_u32|v_add_nc_u32}\t%0, %2, %1
   [v,v,SvB;vop2,8] ^
   })
 
@@ -1503,16 +1535,16 @@
         (plus:V_SI
           (vec_duplicate:V_SI
             (match_operand:SI 1 "gcn_alu_operand"))
-          (match_operand:V_SI 2 "register_operand")))
+          (match_operand:V_SI 2 "gcn_alu_operand")))
    (set (match_operand:DI 3 "register_operand")
-        (ltu:DI (plus:V_SI (vec_duplicate:V_SI (match_dup 2))
-                           (match_dup 1))
-                (vec_duplicate:V_SI (match_dup 2))))]
+        (ltu:DI (plus:V_SI (vec_duplicate:V_SI (match_dup 1))
+                           (match_dup 2))
+                (match_dup 2)))]
   ""
   {@ [cons: =0, 1, 2, =3; attrs: type, length]
-  [v,SvA,v,cV;vop2 ,4] v_add_co_u32\t%0, %3, %1, %2
-  [v,SvB,v,cV;vop2 ,8] ^
-  [v,SvA,v,Sg;vop3b,8] ^
+  [v,SvA,vA,cV;vop2 ,4] v_add_co_u32\t%0, %3, %1, %2
+  [v,SvB,vA,cV;vop2 ,8] ^
+  [v,SvA,vA,Sg;vop3b,8] ^
   })
 
 ; v_addc does not accept an SGPR because the VCC read already counts as an
@@ -1551,16 +1583,15 @@
   [(set_attr "type" "vop2,vop3b")
    (set_attr "length" "4,8")])
 
-(define_insn "sub<mode>3<exec_clobber>"
+(define_insn "sub<mode>3<exec>"
   [(set (match_operand:V_INT_1REG 0 "register_operand"  "=  v,   v")
         (minus:V_INT_1REG
           (match_operand:V_INT_1REG 1 "gcn_alu_operand" "vSvB,   v")
-          (match_operand:V_INT_1REG 2 "gcn_alu_operand" "   v,vSvB")))
-   (clobber (reg:DI VCC_REG))]
+          (match_operand:V_INT_1REG 2 "gcn_alu_operand" "   v,vSvB")))]
   ""
   "@
-   v_sub_co_u32\t%0, vcc, %1, %2
-   v_subrev_co_u32\t%0, vcc, %2, %1"
+   {v_sub_u32|v_sub_nc_u32}\t%0, %1, %2
+   {v_subrev_u32|v_subrev_nc_u32}\t%0, %2, %1"
   [(set_attr "type" "vop2")
    (set_attr "length" "8,8")])
 
@@ -1648,6 +1679,39 @@
   [(set_attr "type" "vmult")
    (set_attr "length" "8")])
 
+(define_insn_and_split "add<mode>3_dup"
+  [(set (match_operand:V_DI 0 "register_operand"   "= v")
+        (plus:V_DI
+          (vec_duplicate:V_DI
+            (match_operand:DI 1 "register_operand" "SvB"))
+          (match_operand:V_DI 2 "gcn_alu_operand"  "vDb")))
+   (clobber (reg:DI VCC_REG))
+   (clobber (match_scratch:<VnSI> 3                "=&v"))]
+  ""
+  "#"
+  "gcn_can_split_p (<MODE>mode, operands[0])
+   && gcn_can_split_p (<MODE>mode, operands[1])
+   && gcn_can_split_p (<MODE>mode, operands[2])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_add<vnsi>3_vcc_dup
+                (gcn_operand_part (<MODE>mode, operands[0], 0),
+                 gcn_operand_part (DImode, operands[1], 0),
+                 gcn_operand_part (<MODE>mode, operands[2], 0),
+                 vcc));
+    emit_insn (gen_vec_duplicate<vnsi> (operands[3],
+                gcn_operand_part (DImode, operands[1], 1)));
+    emit_insn (gen_addc<vnsi>3
+                (gcn_operand_part (<MODE>mode, operands[0], 1),
+                 operands[3],
+                 gcn_operand_part (<MODE>mode, operands[2], 1),
+                 vcc, vcc));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
 (define_insn_and_split "add<mode>3_exec"
   [(set (match_operand:V_DI 0 "register_operand"   "= v")
         (vec_merge:V_DI
@@ -1685,6 +1749,49 @@
   [(set_attr "type" "vmult")
    (set_attr "length" "8")])
 
+(define_insn_and_split "add<mode>3_dup_exec"
+  [(set (match_operand:V_DI 0 "register_operand"                 "= v")
+        (vec_merge:V_DI
+          (plus:V_DI
+            (vec_duplicate:V_DI
+              (match_operand:DI 1 "register_operand"             "SvB"))
+            (match_operand:V_DI 2 "gcn_alu_operand"              "vDb"))
+          (match_operand:V_DI 3 "gcn_register_or_unspec_operand" " U0")
+          (match_operand:DI 4 "gcn_exec_reg_operand"             "  e")))
+   (clobber (reg:DI VCC_REG))
+   (clobber (match_scratch:<VnSI> 5                              "=&v"))]
+  ""
+  "#"
+  "gcn_can_split_p (<MODE>mode, operands[0])
+   && gcn_can_split_p (<MODE>mode, operands[1])
+   && gcn_can_split_p (<MODE>mode, operands[2])
+   && gcn_can_split_p (<MODE>mode, operands[4])"
+  [(const_int 0)]
+  {
+    rtx vcc = gen_rtx_REG (DImode, VCC_REG);
+    emit_insn (gen_add<vnsi>3_vcc_dup_exec
+                (gcn_operand_part (<MODE>mode, operands[0], 0),
+                 gcn_operand_part (DImode, operands[1], 0),
+                 gcn_operand_part (<MODE>mode, operands[2], 0),
+                 vcc,
+                 gcn_operand_part (<MODE>mode, operands[3], 0),
+                 operands[4]));
+    emit_insn (gen_vec_duplicate<vnsi>_exec (operands[5],
+                gcn_operand_part (DImode, operands[1], 1),
+                gcn_gen_undef (<VnSI>mode),
+                operands[4]));
+    emit_insn (gen_addc<vnsi>3_exec
+                (gcn_operand_part (<MODE>mode, operands[0], 1),
+                 operands[5],
+                 gcn_operand_part (<MODE>mode, operands[2], 1),
+                 vcc, vcc,
+                 gcn_operand_part (<MODE>mode, operands[3], 1),
+                 operands[4]));
+    DONE;
+  }
+  [(set_attr "type" "vmult")
+   (set_attr "length" "8")])
+
 (define_insn_and_split "sub<mode>3"
   [(set (match_operand:V_DI 0 "register_operand"  "= v,  v")
         (minus:V_DI
@@ -1827,7 +1934,7 @@
         (ltu:DI (plus:V_DI
                   (zero_extend:V_DI (vec_duplicate:<VnSI> (match_dup 1)))
                   (match_dup 2))
-                (match_dup 1)))]
+                (match_dup 2)))]
   ""
   {@ [cons: =0, 1, 2, =3]
   [v,ASv,v,&Sg] #
@@ -1878,7 +1985,7 @@
                   (ltu:DI (plus:V_DI
                             (zero_extend:V_DI (vec_duplicate:<VnSI> (match_dup 1)))
                             (match_dup 2))
-                          (match_dup 1))
+                          (match_dup 2))
                   (match_dup 5)))]
   ""
   {@ [cons: =0, 1, 2, =3, 4, 5]
@@ -1932,7 +2039,7 @@
         (ltu:DI (plus:V_DI
                   (zero_extend:V_DI (match_dup 1))
                   (vec_duplicate:V_DI (match_dup 2)))
-                (match_dup 1)))]
+                (vec_duplicate:V_DI (match_dup 2))))]
   ""
   {@ [cons: =0, 1, 2, =3]
   [v,v,DbSv,&cV] #
@@ -1981,7 +2088,7 @@
                   (ltu:DI (plus:V_DI
                             (zero_extend:V_DI (match_dup 1))
                             (vec_duplicate:V_DI (match_dup 2)))
-                          (match_dup 1))
+                          (vec_duplicate:V_DI (match_dup 2)))
                   (match_dup 5)))]
   ""
   {@ [cons: =0, 1, 2, =3, 4, 5]
@@ -2190,6 +2297,22 @@
   [(set_attr "type" "vop3a")
    (set_attr "length" "8")])
 
+(define_insn "<su>mul<mode>3_highpart_dup<exec>"
+  [(set (match_operand:V_SI 0 "register_operand"            "=  v")
+        (truncate:V_SI
+          (lshiftrt:<VnDI>
+            (mult:<VnDI>
+              (any_extend:<VnDI>
+                (vec_duplicate:V_SI
+                  (match_operand:SI 1 "gcn_alu_operand"     "SvA")))
+              (any_extend:<VnDI>
+                (match_operand:V_SI 2 "gcn_alu_operand"     " vA")))
+            (const_int 32))))]
+  ""
+  "v_mul_hi<sgnsuffix>0\t%0, %2, %1"
+  [(set_attr "type" "vop3a")
+   (set_attr "length" "8")])
+
 (define_insn "mul<mode>3<exec>"
   [(set (match_operand:V_INT_1REG 0 "register_operand"      "=  v")
         (mult:V_INT_1REG
@@ -2201,11 +2324,11 @@
    (set_attr "length" "8")])
 
 (define_insn "mul<mode>3_dup<exec>"
-  [(set (match_operand:V_INT_1REG 0 "register_operand"      "=   v")
+  [(set (match_operand:V_INT_1REG 0 "register_operand"      "= v")
         (mult:V_INT_1REG
-          (match_operand:V_INT_1REG 1 "gcn_alu_operand"     "%vSvA")
           (vec_duplicate:V_INT_1REG
-            (match_operand:<SCALAR_MODE> 2 "gcn_alu_operand" " SvA"))))]
+            (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SvA"))
+          (match_operand:V_INT_1REG 2 "gcn_alu_operand"     " vA")))]
   ""
   "v_mul_lo_u32\t%0, %1, %2"
   [(set_attr "type" "vop3a")
@@ -2241,6 +2364,37 @@
     DONE;
   })
 
+(define_insn_and_split "mul<mode>3_dup"
+  [(set (match_operand:V_DI 0 "register_operand"  "=&v")
+        (mult:V_DI
+          (vec_duplicate:V_DI
+            (match_operand:DI 1 "gcn_alu_operand" " Sv"))
+          (match_operand:V_DI 2 "gcn_alu_operand" "vDA")))
+   (clobber (match_scratch:<VnSI> 3               "=&v"))]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+  {
+    rtx out_lo = gcn_operand_part (<MODE>mode, operands[0], 0);
+    rtx out_hi = gcn_operand_part (<MODE>mode, operands[0], 1);
+    rtx left_lo = gcn_operand_part (DImode, operands[1], 0);
+    rtx left_hi = gcn_operand_part (DImode, operands[1], 1);
+    rtx right_lo = gcn_operand_part (<MODE>mode, operands[2], 0);
+    rtx right_hi = gcn_operand_part (<MODE>mode, operands[2], 1);
+    rtx tmp = operands[3];
+
+    emit_insn (gen_mul<vnsi>3_dup (out_lo, left_lo, right_lo));
+    emit_insn (gen_umul<vnsi>3_highpart_dup (out_hi, left_lo, right_lo));
+    emit_insn (gen_mul<vnsi>3_dup (tmp, left_hi, right_lo));
+    emit_insn (gen_add<vnsi>3 (out_hi, out_hi, tmp));
+    emit_insn (gen_mul<vnsi>3_dup (tmp, left_lo, right_hi));
+    emit_insn (gen_add<vnsi>3 (out_hi, out_hi, tmp));
+    emit_insn (gen_mul<vnsi>3_dup (tmp, left_hi, right_hi));
+    emit_insn (gen_add<vnsi>3 (out_hi, out_hi, tmp));
+    DONE;
+  })
+
 (define_insn_and_split "mul<mode>3_exec"
   [(set (match_operand:V_DI 0 "register_operand" "=&v")
         (vec_merge:V_DI
@@ -2289,6 +2443,56 @@
     DONE;
   })
 
+(define_insn_and_split "mul<mode>3_dup_exec"
+  [(set (match_operand:V_DI 0 "register_operand"                 "=&v")
+        (vec_merge:V_DI
+          (mult:V_DI
+            (vec_duplicate:V_DI
+              (match_operand:DI 1 "gcn_alu_operand"              " Sv"))
+            (match_operand:V_DI 2 "gcn_alu_operand"              "vDA"))
+          (match_operand:V_DI 3 "gcn_register_or_unspec_operand" " U0")
+          (match_operand:DI 4 "gcn_exec_reg_operand"             "  e")))
+   (clobber (match_scratch:<VnSI> 5                              "=&v"))]
+  ""
+  "#"
+  "reload_completed"
+  [(const_int 0)]
+  {
+    rtx out_lo = gcn_operand_part (<MODE>mode, operands[0], 0);
+    rtx out_hi = gcn_operand_part (<MODE>mode, operands[0], 1);
+    rtx left_lo = gcn_operand_part (DImode, operands[1], 0);
+    rtx left_hi = gcn_operand_part (DImode, operands[1], 1);
+    rtx right_lo = gcn_operand_part (<MODE>mode, operands[2], 0);
+    rtx right_hi = gcn_operand_part (<MODE>mode, operands[2], 1);
+    rtx exec = operands[4];
+    rtx tmp = operands[5];
+
+    rtx old_lo, old_hi;
+    if (GET_CODE (operands[3]) == UNSPEC)
+      {
+        old_lo = old_hi = gcn_gen_undef (<VnSI>mode);
+      }
+    else
+      {
+        old_lo = gcn_operand_part (<MODE>mode, operands[3], 0);
+        old_hi = gcn_operand_part (<MODE>mode, operands[3], 1);
+      }
+
+    rtx undef = gcn_gen_undef (<VnSI>mode);
+
+    emit_insn (gen_mul<vnsi>3_dup_exec (out_lo, left_lo, right_lo, old_lo,
+                                        exec));
+    emit_insn (gen_umul<vnsi>3_highpart_dup_exec (out_hi, left_lo, right_lo,
+                                                  old_hi, exec));
+    emit_insn (gen_mul<vnsi>3_dup_exec (tmp, left_hi, right_lo, undef, exec));
+    emit_insn (gen_add<vnsi>3_exec (out_hi, out_hi, tmp, out_hi, exec));
+    emit_insn (gen_mul<vnsi>3_dup_exec (tmp, left_lo, right_hi, undef, exec));
+    emit_insn (gen_add<vnsi>3_exec (out_hi, out_hi, tmp, out_hi, exec));
+    emit_insn (gen_mul<vnsi>3_dup_exec (tmp, left_hi, right_hi, undef, exec));
+    emit_insn (gen_add<vnsi>3_exec (out_hi, out_hi, tmp, out_hi, exec));
+    DONE;
+  })
+
 (define_insn_and_split "mul<mode>3_zext"
   [(set (match_operand:V_DI 0 "register_operand" "=&v")
         (mult:V_DI
@@ -3795,9 +3999,9 @@
     /* Unsigned comparisons use the same patterns as signed comparisons,
        except that they use unsigned operators (e.g. LTU vs LT).
       The '%E1' directive then does the Right Thing.  */
-    emit_insn (gen_vec_cmpu<mode>di_exec (operands[0], operands[1],
-                                          operands[2], operands[3],
-                                          operands[4]));
+    emit_insn (gen_vec_cmp<mode>di_exec (operands[0], operands[1],
+                                         operands[2], operands[3],
+                                         operands[4]));
     DONE;
   })
 
@@ -4052,6 +4256,32 @@
     DONE;
   })
 
+(define_expand "mask_gather_load<mode><vndi>"
+  [(set:V_MOV (match_operand:V_MOV 0 "register_operand")
+        (unspec:V_MOV
+          [(match_operand:DI 1 "register_operand")
+           (match_operand:<VnDI> 2 "register_operand")
+           (match_operand 3 "immediate_operand")
+           (match_operand:SI 4 "gcn_alu_operand")
+           (match_operand:DI 5 "")
+           (match_operand:V_MOV 6 "maskload_else_operand")]
+          UNSPEC_GATHER))]
+  ""
+  {
+    rtx exec = force_reg (DImode, operands[5]);
+
+    rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[1],
+                                          operands[2], operands[4],
+                                          INTVAL (operands[3]), exec);
+
+    emit_insn (gen_gather<mode>_insn_1offset_exec (operands[0], addr,
+                                                   const0_rtx, const0_rtx,
+                                                   const0_rtx,
+                                                   gcn_gen_undef (<MODE>mode),
+                                                   exec));
+    DONE;
+  })
+
 (define_expand "mask_scatter_store<mode><vnsi>"
   [(match_operand:DI 0 "register_operand")
    (match_operand:<VnSI> 1 "register_operand")
@@ -4080,6 +4310,27 @@
     DONE;
   })
 
+(define_expand "mask_scatter_store<mode><vndi>"
+  [(match_operand:DI 0 "register_operand")
+   (match_operand:<VnDI> 1 "register_operand")
+   (match_operand 2 "immediate_operand")
+   (match_operand:SI 3 "gcn_alu_operand")
+   (match_operand:V_MOV 4 "register_operand")
+   (match_operand:DI 5 "")]
+  ""
+  {
+    rtx exec = force_reg (DImode, operands[5]);
+
+    rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[0],
+                                          operands[1], operands[3],
+                                          INTVAL (operands[2]), exec);
+
+    emit_insn (gen_scatter<mode>_insn_1offset_exec (addr, const0_rtx,
+                                                    operands[4], const0_rtx,
+                                                    const0_rtx, exec));
+    DONE;
+  })
+
 (define_code_iterator cond_op [plus minus mult])
 
 (define_expand "cond_<expander><mode>"
@@ -4400,7 +4651,7 @@
     rtx tmp = gen_reg_rtx (<MODE>mode);
     rtx v1 = gen_rtx_REG (<MODE>mode, VGPR_REGNO (1));
 
-    emit_insn (gen_mul<mode>3_dup (tmp, v1, operands[2]));
+    emit_insn (gen_mul<mode>3_dup (tmp, operands[2], v1));
     emit_insn (gen_add<mode>3_dup (operands[0], tmp, operands[1]));
     DONE;
   })
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 9b882d9..a6f3731 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -54,6 +54,7 @@
 #include "gimple.h"
 #include "cgraph.h"
 #include "case-cfn-macros.h"
+#include "opts.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -183,6 +184,11 @@ gcn_option_override (void)
 
   if (flag_sram_ecc == HSACO_ATTR_DEFAULT)
     flag_sram_ecc = gcn_devices[gcn_arch].sramecc_default;
+
+  /* TODO: This seems to produce tighter loops, but the testsuites expects it
+     to be set to '2', so I'll leave it default for now.
+     SET_OPTION_IF_UNSET (&global_options, &global_options_set,
+                          param_vect_partial_vector_usage, 1);  */
 }
 
 /* }}} */
@@ -1276,13 +1282,13 @@ gen_##PREFIX##vN##SUFFIX (PARAMS) \
 }
 
 #define GEN_VNM_NOEXEC(PREFIX, SUFFIX, PARAMS, ARGS) \
-GEN_VN_NOEXEC (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN_NOEXEC (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN_NOEXEC (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS))) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS))) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS))) \
 GEN_VN_NOEXEC (PREFIX, si##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN_NOEXEC (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS))) \
 GEN_VN_NOEXEC (PREFIX, di##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN_NOEXEC (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN_NOEXEC (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS))) \
 static rtx \
 gen_##PREFIX##vNm##SUFFIX (PARAMS) \
 { \
@@ -1290,13 +1296,13 @@ gen_##PREFIX##vNm##SUFFIX (PARAMS) \
 { \
 \
   switch (mode) \
     { \
-    case E_QImode: return gen_##PREFIX##vNqi##SUFFIX (ARGS); \
-    case E_HImode: return gen_##PREFIX##vNhi##SUFFIX (ARGS); \
-    case E_HFmode: return gen_##PREFIX##vNhf##SUFFIX (ARGS); \
+    USE_QHF (case E_QImode: return gen_##PREFIX##vNqi##SUFFIX (ARGS);) \
+    USE_QHF (case E_HImode: return gen_##PREFIX##vNhi##SUFFIX (ARGS);) \
+    USE_QHF (case E_HFmode: return gen_##PREFIX##vNhf##SUFFIX (ARGS);) \
     case E_SImode: return gen_##PREFIX##vNsi##SUFFIX (ARGS); \
-    case E_SFmode: return gen_##PREFIX##vNsf##SUFFIX (ARGS); \
+    USE_QHF (case E_SFmode: return gen_##PREFIX##vNsf##SUFFIX (ARGS);) \
     case E_DImode: return gen_##PREFIX##vNdi##SUFFIX (ARGS); \
-    case E_DFmode: return gen_##PREFIX##vNdf##SUFFIX (ARGS); \
+    USE_QHF (case E_DFmode: return gen_##PREFIX##vNdf##SUFFIX (ARGS);) \
     default: \
       break; \
     } \
@@ -1341,13 +1347,13 @@ gen_##PREFIX##vN##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
 }
 
 #define GEN_VNM(PREFIX, SUFFIX, PARAMS, ARGS) \
-GEN_VN (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS))) \
+USE_QHF (GEN_VN (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS))) \
+USE_QHF (GEN_VN (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS))) \
 GEN_VN (PREFIX, si##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS))) \
 GEN_VN (PREFIX, di##SUFFIX, A(PARAMS), A(ARGS)) \
-GEN_VN (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS)) \
+USE_QHF (GEN_VN (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS))) \
 USE_TI (GEN_VN (PREFIX, ti##SUFFIX, A(PARAMS), A(ARGS))) \
 static rtx \
 gen_##PREFIX##vNm##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
@@ -1356,15 +1362,22 @@ gen_##PREFIX##vNm##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
 \
   switch (mode) \
     { \
-    case E_QImode: return gen_##PREFIX##vNqi##SUFFIX (ARGS, merge_src, exec); \
-    case E_HImode: return gen_##PREFIX##vNhi##SUFFIX (ARGS, merge_src, exec); \
-    case E_HFmode: return gen_##PREFIX##vNhf##SUFFIX (ARGS, merge_src, exec); \
-    case E_SImode: return gen_##PREFIX##vNsi##SUFFIX (ARGS, merge_src, exec); \
-    case E_SFmode: return gen_##PREFIX##vNsf##SUFFIX (ARGS, merge_src, exec); \
-    case E_DImode: return gen_##PREFIX##vNdi##SUFFIX (ARGS, merge_src, exec); \
-    case E_DFmode: return gen_##PREFIX##vNdf##SUFFIX (ARGS, merge_src, exec); \
-    case E_TImode: \
-      USE_TI (return gen_##PREFIX##vNti##SUFFIX (ARGS, merge_src, exec);) \
+    USE_QHF (case E_QImode: \
+             return gen_##PREFIX##vNqi##SUFFIX (ARGS, merge_src, exec);) \
+    USE_QHF (case E_HImode: \
+             return gen_##PREFIX##vNhi##SUFFIX (ARGS, merge_src, exec);) \
+    USE_QHF (case E_HFmode: \
+             return gen_##PREFIX##vNhf##SUFFIX (ARGS, merge_src, exec);) \
+    case E_SImode: \
+      return gen_##PREFIX##vNsi##SUFFIX (ARGS, merge_src, exec); \
+    USE_QHF (case E_SFmode: \
+             return gen_##PREFIX##vNsf##SUFFIX (ARGS, merge_src, exec);) \
+    case E_DImode: \
+      return gen_##PREFIX##vNdi##SUFFIX (ARGS, merge_src, exec); \
+    USE_QHF (case E_DFmode: \
+             return gen_##PREFIX##vNdf##SUFFIX (ARGS, merge_src, exec);) \
+    USE_TI (case E_TImode: \
+            return gen_##PREFIX##vNti##SUFFIX (ARGS, merge_src, exec);) \
     default: \
       break; \
     } \
@@ -1373,7 +1386,8 @@ gen_##PREFIX##vNm##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
   return NULL_RTX; \
 }
 
-/* These have TImode support.  */
+/* These support everything.  */
+#define USE_QHF(ARGS) ARGS
 #define USE_TI(ARGS) ARGS
 GEN_VNM (mov,, A(rtx dest, rtx src), A(dest, src))
 GEN_VNM (vec_duplicate,, A(rtx dest, rtx src), A(dest, src))
@@ -1383,6 +1397,7 @@ GEN_VNM (vec_duplicate,, A(rtx dest, rtx src), A(dest, src))
 #define USE_TI(ARGS)
 GEN_VNM (add,3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
 GEN_VN (add,si3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
+GEN_VN (add,di3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
 GEN_VN (add,si3_vcc_dup, A(rtx dest, rtx src1, rtx src2, rtx vcc),
         A(dest, src1, src2, vcc))
 GEN_VN (add,di3_sext_dup2, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
@@ -1394,15 +1409,20 @@ GEN_VN (add,di3_vcc_zext_dup2, A(rtx dest, rtx src1, rtx src2, rtx vcc),
 GEN_VN (addc,si3, A(rtx dest, rtx src1, rtx src2, rtx vccout, rtx vccin),
         A(dest, src1, src2, vccout, vccin))
 GEN_VN (and,si3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
-GEN_VN (ashl,si3, A(rtx dest, rtx src, rtx shift), A(dest, src, shift))
 GEN_VNM_NOEXEC (ds_bpermute,, A(rtx dest, rtx addr, rtx src, rtx exec),
                 A(dest, addr, src, exec))
 GEN_VNM (gather,_expr, A(rtx dest, rtx addr, rtx as, rtx vol),
          A(dest, addr, as, vol))
-GEN_VN (mul,si3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
 GEN_VN (sub,si3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
 GEN_VN_NOEXEC (vec_series,si, A(rtx dest, rtx x, rtx c), A(dest, x, c))
 
+/* These do not have QI, HI, or any FP support.  */
+#undef USE_QHF
+#define USE_QHF(ARGS)
+GEN_VNM (ashl,3, A(rtx dest, rtx src, rtx shift), A(dest, src, shift))
+GEN_VNM (mul,3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
+
+#undef USE_QHF
 #undef USE_TI
 #undef GEN_VNM
 #undef GEN_VN
@@ -1996,8 +2016,8 @@ gcn_expand_vector_init (rtx op0, rtx vec)
   rtx addr = gen_reg_rtx (addrmode);
   int unit_size = GET_MODE_SIZE (GET_MODE_INNER (GET_MODE (op0)));
 
-  emit_insn (gen_mulvNsi3_dup (ramp, gen_rtx_REG (offsetmode, VGPR_REGNO (1)),
-                               GEN_INT (unit_size)));
+  emit_insn (gen_mulvNsi3_dup (ramp, GEN_INT (unit_size),
+                               gen_rtx_REG (offsetmode, VGPR_REGNO (1))));
 
   bool simple_repeat = true;
 
@@ -2294,36 +2314,46 @@ gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
    Return values.
      ADDR_SPACE_FLAT   - return VnDImode vector of absolute addresses.
-     ADDR_SPACE_GLOBAL - return VnSImode vector of offsets.  */
+     ADDR_SPACE_GLOBAL - return VnSImode vector of offsets.
+     64-bit offsets    - return VnDImode vector of absolute addresses.  */
 
 rtx
 gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
                            bool unsigned_p, rtx exec)
 {
   int vf = GET_MODE_NUNITS (GET_MODE (offsets));
-  rtx tmpsi = gen_reg_rtx (VnMODE (vf, SImode));
-  rtx tmpdi = gen_reg_rtx (VnMODE (vf, DImode));
+  rtx scaled_offsets = gen_reg_rtx (GET_MODE (offsets));
+  rtx abs_addr = gen_reg_rtx (VnMODE (vf, DImode));
+  bool use_di = GET_MODE_INNER (GET_MODE (scaled_offsets)) == DImode;
 
   if (CONST_INT_P (scale)
       && INTVAL (scale) > 0
      && exact_log2 (INTVAL (scale)) >= 0)
-    emit_insn (gen_ashlvNsi3 (tmpsi, offsets,
-                              GEN_INT (exact_log2 (INTVAL (scale))),
-                              NULL, exec));
+    emit_insn (gen_ashlvNm3 (scaled_offsets, offsets,
+                             GEN_INT (exact_log2 (INTVAL (scale))),
+                             NULL, exec));
   else
-    emit_insn (gen_mulvNsi3_dup (tmpsi, offsets, scale, NULL, exec));
+    emit_insn (gen_mulvNm3_dup (scaled_offsets, scale, offsets, NULL, exec));
 
+  /* No instructions support DImode offsets.  */
+  if (use_di)
+    {
+      emit_insn (gen_addvNdi3_dup (abs_addr, base, scaled_offsets, NULL, exec));
+      return abs_addr;
+    }
   /* "Global" instructions do not support negative register offsets.  */
-  if (as == ADDR_SPACE_FLAT || !unsigned_p)
+  else if (as == ADDR_SPACE_FLAT || !unsigned_p)
     {
       if (unsigned_p)
-        emit_insn (gen_addvNdi3_zext_dup2 (tmpdi, tmpsi, base, NULL, exec));
+        emit_insn (gen_addvNdi3_zext_dup2 (abs_addr, scaled_offsets, base,
+                                           NULL, exec));
      else
-        emit_insn (gen_addvNdi3_sext_dup2 (tmpdi, tmpsi, base, NULL, exec));
-      return tmpdi;
+        emit_insn (gen_addvNdi3_sext_dup2 (abs_addr, scaled_offsets, base,
+                                           NULL, exec));
+      return abs_addr;
    }
   else if (as == ADDR_SPACE_GLOBAL)
-    return tmpsi;
+    return scaled_offsets;
 
   gcc_unreachable ();
 }
@@ -5765,6 +5795,16 @@ gcn_libc_has_function (enum function_class fn_class,
   return bsd_libc_has_function (fn_class, type);
 }
 
+/* Implement TARGET_VECTORIZE_PREFER_GATHER_SCATTER.  */
+
+static bool
+gcn_prefer_gather_scatter (machine_mode ARG_UNUSED (mode),
+                           int ARG_UNUSED (scale),
+                           unsigned int ARG_UNUSED (group_size))
+{
+  return true;
+}
+
 /* }}}  */
 /* {{{ md_reorg pass.  */
 
@@ -7964,6 +8004,8 @@ gcn_dwarf_register_span (rtx rtl)
   gcn_vectorize_builtin_vectorized_function
 #undef TARGET_VECTORIZE_GET_MASK_MODE
 #define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
+#undef TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+#define TARGET_VECTORIZE_PREFER_GATHER_SCATTER gcn_prefer_gather_scatter
 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 1998931..abb1850 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -1136,14 +1136,13 @@
   [(set (match_operand:SI 0 "register_operand"          "= Sg, Sg, Sg,   v")
        (plus:SI (match_operand:SI 1 "gcn_alu_operand"   "%SgA,  0,SgA,   v")
                 (match_operand:SI 2 "gcn_alu_operand"   " SgA,SgJ,  B,vBSv")))
-   (clobber (match_scratch:BI 3                         "= cs, cs, cs,  X"))
-   (clobber (match_scratch:DI 4                         "=  X,  X,  X, cV"))]
+   (clobber (match_scratch:BI 3                         "= cs, cs, cs,  X"))]
   ""
   "@
   s_add_i32\t%0, %1, %2
   s_addk_i32\t%0, %2
   s_add_i32\t%0, %1, %2
-  v_add_co_u32\t%0, vcc, %2, %1"
+  {v_add_u32|v_add_nc_u32}\t%0, %2, %1"
   [(set_attr "type" "sop2,sopk,sop2,vop2")
    (set_attr "length" "4,4,8,8")])
 
@@ -1151,8 +1150,7 @@
   [(parallel [(set (match_operand:SI 0 "register_operand")
                    (plus:SI (match_operand:SI 1 "gcn_alu_operand")
                             (match_operand:SI 2 "gcn_alu_operand")))
-              (clobber (reg:BI SCC_REG))
-              (clobber (scratch:DI))])]
+              (clobber (reg:BI SCC_REG))])]
   ""
   {})
 
@@ -1332,14 +1330,13 @@
   [(set (match_operand:SI 0 "register_operand"          "=Sg, Sg,    v,    v")
        (minus:SI (match_operand:SI 1 "gcn_alu_operand"  "SgA,SgA,    v, vBSv")
                  (match_operand:SI 2 "gcn_alu_operand"  "SgA,  B, vBSv,    v")))
-   (clobber (match_scratch:BI 3                         "=cs, cs,  X,  X"))
-   (clobber (match_scratch:DI 4                         "= X,  X, cV, cV"))]
+   (clobber (match_scratch:BI 3                         "=cs, cs,  X,  X"))]
  ""
  "@
   s_sub_i32\t%0, %1, %2
   s_sub_i32\t%0, %1, %2
-  v_subrev_co_u32\t%0, vcc, %2, %1
-  v_sub_co_u32\t%0, vcc, %1, %2"
+  {v_subrev_u32|v_subrev_nc_u32}\t%0, %2, %1
+  {v_sub_u32|v_sub_nc_u32}\t%0, %1, %2"
  [(set_attr "type" "sop2,sop2,vop2,vop2")
   (set_attr "length" "4,8,8,8")])
 
@@ -1569,8 +1566,7 @@
        (mult:DI (match_operand:DI 1 "register_operand"   "%Sg, Sg,  v,  v")
                 (match_operand:DI 2 "nonmemory_operand"  " Sg,  i,vSv,  A")))
    (clobber (match_scratch:SI 3                          "=&Sg,&Sg,&v,&v"))
-   (clobber (match_scratch:BI 4                          "=cs, cs, X, X"))
-   (clobber (match_scratch:DI 5                          "=X,  X,cV,cV"))]
+   (clobber (match_scratch:BI 4                          "=cs, cs, X, X"))]
   ""
   "#"
   "reload_completed"
@@ -1585,15 +1581,13 @@
   emit_insn (gen_umulsidi3 (operands[0], op1lo, op2lo));
   emit_insn (gen_mulsi3 (tmp, op1lo, op2hi));
   rtx add = gen_rtx_SET (dsthi, gen_rtx_PLUS (SImode, dsthi, tmp));
-  rtx clob1 = gen_rtx_CLOBBER (VOIDmode, operands[4]);
-  rtx clob2 = gen_rtx_CLOBBER (VOIDmode, operands[5]);
-  add = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (3, add, clob1, clob2));
+  rtx clob = gen_rtx_CLOBBER (VOIDmode, operands[4]);
+  add = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, add, clob));
   emit_insn (add);
   emit_insn (gen_mulsi3 (tmp, op1hi, op2lo));
   add = gen_rtx_SET (dsthi, gen_rtx_PLUS (SImode, dsthi, tmp));
-  clob1 = gen_rtx_CLOBBER (VOIDmode, operands[4]);
-  clob2 = gen_rtx_CLOBBER (VOIDmode, operands[5]);
-  add = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (3, add, clob1, clob2));
+  clob = gen_rtx_CLOBBER (VOIDmode, operands[4]);
+  add = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, add, clob));
   emit_insn (add);
   DONE;
 })
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index a96700c..ff52c4f 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6515,6 +6515,15 @@ The default is @code{NULL_TREE} which means to not vectorize scatter
 stores.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFER_GATHER_SCATTER (machine_mode @var{mode}, int @var{scale}, unsigned int @var{group_size})
+This hook returns TRUE if gather loads or scatter stores are cheaper on
+this target than a sequence of elementwise loads or stores.  The @var{mode}
+and @var{scale} correspond to the @code{gather_load} and
+@code{scatter_store} instruction patterns.  The @var{group_size} is the
+number of scalar elements in each scalar loop iteration that are to be
+combined into the vector.
+@end deftypefn
+
 @deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int}, @var{bool})
 This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float}
 fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index eccc4d8..b03ad4c 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4311,6 +4311,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_BUILTIN_SCATTER
 
+@hook TARGET_VECTORIZE_PREFER_GATHER_SCATTER
+
 @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
 
 @hook TARGET_SIMD_CLONE_ADJUST
diff --git a/gcc/hooks.cc b/gcc/hooks.cc
index 951825d..76cb5931 100644
--- a/gcc/hooks.cc
+++ b/gcc/hooks.cc
@@ -117,6 +117,13 @@ hook_bool_mode_const_rtx_true (machine_mode, const_rtx)
   return true;
 }
 
+/* Generic hook that takes (machine_mode, int, unsigned) and returns false.  */
+bool
+hook_bool_mode_int_unsigned_false (machine_mode, int, unsigned)
+{
+  return false;
+}
+
 /* Generic hook that takes (machine_mode, rtx) and returns false.  */
 bool
 hook_bool_mode_rtx_false (machine_mode, rtx)
diff --git a/gcc/hooks.h b/gcc/hooks.h
index c0663bf..e95bd11 100644
--- a/gcc/hooks.h
+++ b/gcc/hooks.h
@@ -36,6 +36,7 @@ extern bool hook_bool_mode_true (machine_mode);
 extern bool hook_bool_mode_mode_true (machine_mode, machine_mode);
 extern bool hook_bool_mode_const_rtx_false (machine_mode, const_rtx);
 extern bool hook_bool_mode_const_rtx_true (machine_mode, const_rtx);
+extern bool hook_bool_mode_int_unsigned_false (machine_mode, int, unsigned);
 extern bool hook_bool_mode_rtx_false (machine_mode, rtx);
 extern bool hook_bool_mode_rtx_true (machine_mode, rtx);
 extern bool hook_bool_const_rtx_insn_const_rtx_insn_true (const rtx_insn *,
diff --git a/gcc/optc-save-gen.awk b/gcc/optc-save-gen.awk
index a3d7e5a..31756ec 100644
--- a/gcc/optc-save-gen.awk
+++ b/gcc/optc-save-gen.awk
@@ -1313,6 +1313,12 @@ for (i = 0; i < n_opts; i++) {
     # offloading is enabled.
     if (flag_set_p("Target", flags[i]))
         var_target_opt[n_opt_val] = 1;
+
+    # These options should not be passed from host to target, but
+    # are not actually target specific.
+    if (flag_set_p("NoOffload", flags[i]))
+        var_target_opt[n_opt_val] = 2;
+
     n_opt_val++;
 }
 }
@@ -1393,7 +1399,7 @@ for (i = 0; i < n_opt_val; i++) {
     # Do not stream out target-specific opts if offloading is
     # enabled.
     if (var_target_opt[i])
-        print " if (!lto_stream_offload_p)"
+        print " if (!lto_stream_offload_p) {"
     # If applicable, encode the streamed value.
     if (var_opt_optimize_init[i]) {
         print " if (" var_opt_optimize_init[i] " > (" var_opt_val_type[i] ") 10)";
@@ -1403,6 +1409,8 @@
     } else {
         print " bp_pack_var_len_" sgn " (bp, ptr->" name");";
     }
+    if (var_target_opt[i])
+        print "}"
     }
 }
 print " for (size_t i = 0; i < ARRAY_SIZE (ptr->explicit_mask); i++)";
@@ -1418,10 +1426,14 @@ print " struct cl_optimization *ptr ATTRIBUTE_UNUSED)"
 print "{";
 for (i = 0; i < n_opt_val; i++) {
     name = var_opt_val[i]
-    if (var_target_opt[i]) {
+    if (var_target_opt[i] == 1) {
         print "#ifdef ACCEL_COMPILER"
         print "#error accel compiler cannot define Optimization attribute for target-specific option " name;
         print "#else"
+    } else if (var_target_opt[i] == 2) {
+        print "#ifdef ACCEL_COMPILER"
+        print " ptr->" name " = global_options." name ";"
+        print "#else"
     }
     otype = var_opt_val_type[i];
     if (otype ~ "^const char \\**$") {
@@ -1489,6 +1501,9 @@ for (i = 0; i < n_opts; i++) {
     if (flag_set_p("Warning", flags[i]))
         continue;
 
+    if (flag_set_p("NoOffload", flags[i]))
+        continue;
+
     if (name in checked_options)
         continue;
     checked_options[name]++
diff --git a/gcc/params.opt b/gcc/params.opt
index a2b606f..28b47ff 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1226,7 +1226,7 @@ Common Joined UInteger Var(param_use_canonical_types) Init(1) IntegerRange(0, 1)
 Whether to use canonical types.
 
 -param=vect-epilogues-nomask=
-Common Joined UInteger Var(param_vect_epilogues_nomask) Init(1) IntegerRange(0, 1) Param Optimization
+Common Joined UInteger Var(param_vect_epilogues_nomask) Init(1) IntegerRange(0, 1) Param Optimization NoOffload
 Enable loop epilogue vectorization using smaller vector size.
 
 -param=vect-max-layout-candidates=
@@ -1246,11 +1246,11 @@ Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6)
 Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
 
 -param=vect-partial-vector-usage=
-Common Joined UInteger Var(param_vect_partial_vector_usage) Init(2) IntegerRange(0, 2) Param Optimization
+Common Joined UInteger Var(param_vect_partial_vector_usage) Init(2) IntegerRange(0, 2) Param Optimization NoOffload
 Controls how loop vectorizer uses partial vectors.  0 means never, 1 means only for loops whose need to iterate can be removed, 2 means for all loops.  The default value is 2.
 
 -param=vect-inner-loop-cost-factor=
-Common Joined UInteger Var(param_vect_inner_loop_cost_factor) Init(50) IntegerRange(1, 10000) Param Optimization
+Common Joined UInteger Var(param_vect_inner_loop_cost_factor) Init(50) IntegerRange(1, 10000) Param Optimization NoOffload
 The maximum factor which the loop vectorizer applies to the cost of statements in an inner loop relative to the loop being vectorized.
 
 -param=vect-induction-float=
diff --git a/gcc/target.def b/gcc/target.def
index 6c7cdc8..c631131 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2056,6 +2056,20 @@ all zeros.  GCC can then try to branch around the instruction instead.",
 (unsigned ifn),
 default_empty_mask_is_expensive)
 
+/* Prefer gather/scatter loads/stores to e.g. elementwise accesses if\n\
+we cannot use a contiguous access.  */
+DEFHOOK
+(prefer_gather_scatter,
+ "This hook returns TRUE if gather loads or scatter stores are cheaper on\n\
+this target than a sequence of elementwise loads or stores.  The @var{mode}\n\
+and @var{scale} correspond to the @code{gather_load} and\n\
+@code{scatter_store} instruction patterns.  The @var{group_size} is the\n\
+number of scalar elements in each scalar loop iteration that are to be\n\
+combined into the vector.",
+ bool,
+ (machine_mode mode, int scale, unsigned int group_size),
+ hook_bool_mode_int_unsigned_false)
+
 /* Target builtin that implements vector gather operation.  */
 DEFHOOK
 (builtin_gather,
diff --git a/gcc/testsuite/ChangeLog.omp b/gcc/testsuite/ChangeLog.omp
index a7ea0db..a67b8b7 100644
--- a/gcc/testsuite/ChangeLog.omp
+++ b/gcc/testsuite/ChangeLog.omp
@@ -1,3 +1,19 @@
+2025-07-21  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    Backported from master:
+    2025-05-12  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    * gcc.target/nvptx/march-map=sm_61.c: Adjust.
+    * gcc.target/nvptx/march-map=sm_62.c: Likewise.
+    * gcc.target/nvptx/march=sm_61.c: New.
+
+2025-07-21  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    Backported from master:
+    2025-05-12  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    * gcc.target/nvptx/mptx=5.0.c: New.
+
 2025-07-03  Thomas Schwinge  <tschwinge@baylibre.com>
 
     Backported from master:
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 978a462..f09e06b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1822,21 +1822,35 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info,
 
 static bool
 vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
+                                    tree vectype,
                                     loop_vec_info loop_vinfo, bool masked_p,
                                     gather_scatter_info *gs_info,
-                                    vec<int> *elsvals)
+                                    vec<int> *elsvals,
+                                    unsigned int group_size,
+                                    bool single_element_p)
 {
   if (!vect_check_gather_scatter (stmt_info, loop_vinfo, gs_info, elsvals)
       || gs_info->ifn == IFN_LAST)
-    return vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo,
-                                                masked_p, gs_info, elsvals);
+    {
+      if (!vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo,
+                                                masked_p, gs_info, elsvals))
+        return false;
+    }
+  else
+    {
+      tree old_offset_type = TREE_TYPE (gs_info->offset);
+      tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
 
-  tree old_offset_type = TREE_TYPE (gs_info->offset);
-  tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
+      gcc_assert (TYPE_PRECISION (new_offset_type)
+                  >= TYPE_PRECISION (old_offset_type));
+      gs_info->offset = fold_convert (new_offset_type, gs_info->offset);
+    }
 
-  gcc_assert (TYPE_PRECISION (new_offset_type)
-              >= TYPE_PRECISION (old_offset_type));
-  gs_info->offset = fold_convert (new_offset_type, gs_info->offset);
+  if (!single_element_p
+      && !targetm.vectorize.prefer_gather_scatter (TYPE_MODE (vectype),
+                                                   gs_info->scale,
+                                                   group_size))
+    return false;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -2397,11 +2411,11 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
      allows us to use contiguous accesses.  */
   if ((*memory_access_type == VMAT_ELEMENTWISE
       || *memory_access_type == VMAT_STRIDED_SLP)
-      && single_element_p
       && (!slp_node || SLP_TREE_LANES (slp_node) == 1)
       && loop_vinfo
-      && vect_use_strided_gather_scatters_p (stmt_info, loop_vinfo,
-                                             masked_p, gs_info, elsvals))
+      && vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
+                                             masked_p, gs_info, elsvals,
+                                             group_size, single_element_p))
     *memory_access_type = VMAT_GATHER_SCATTER;
 
   if (*memory_access_type == VMAT_CONTIGUOUS_DOWN
@@ -2558,8 +2572,9 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
     {
       gcc_assert (!slp_node);
       if (loop_vinfo
-          && vect_use_strided_gather_scatters_p (stmt_info, loop_vinfo,
-                                                 masked_p, gs_info, elsvals))
+          && vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
+                                                 masked_p, gs_info, elsvals,
+                                                 1, false))
         *memory_access_type = VMAT_GATHER_SCATTER;
       else
         *memory_access_type = VMAT_ELEMENTWISE;
diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
index 689cfbf..73baf37 100644
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -1,3 +1,35 @@
+2025-07-21  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    Backported from master:
+    2025-07-17  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    PR target/119692
+    * testsuite/libgomp.c++/pr119692-1-4.C: '{ dg-timeout 10 { target offload_device } }'.
+    * testsuite/libgomp.c++/pr119692-1-5.C: Likewise.
+    * testsuite/libgomp.c++/target-exceptions-bad_cast-1.C: Likewise.
+    * testsuite/libgomp.c++/target-exceptions-bad_cast-2.C: Likewise.
+    * testsuite/libgomp.oacc-c++/exceptions-bad_cast-1.C: Likewise.
+    * testsuite/libgomp.oacc-c++/exceptions-bad_cast-2.C: Likewise.
+
+2025-07-21  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    Backported from master:
+    2025-05-12  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    * testsuite/libgomp.c/declare-variant-3-sm61.c: New.
+    * testsuite/libgomp.c/declare-variant-3.h: Adjust.
+
+2025-07-21  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    Backported from master:
+    2025-07-21  Thomas Schwinge  <tschwinge@baylibre.com>
+
+    PR target/119853
+    PR target/119854
+    * testsuite/libgomp.c++/target-cdtor-1.C: Adjust for
+    'targetm.cxx.use_aeabi_atexit'.
+    * testsuite/libgomp.c++/target-cdtor-2.C: Likewise.
+
 2025-07-03  Jakub Jelinek  <jakub@redhat.com>
 
     Backported from master:
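
A closing note on how the vectorizer side ties together: vect_use_strided_gather_scatters_p now receives the vector type, group size, and single-element flag, and it only accepts a strided gather/scatter for a multi-element group when the new hook approves. On amdgcn, where the hook returns true, a grouped strided loop such as the following (a hypothetical illustration, not a test case from this patch) becomes a gather/scatter candidate instead of being decomposed into elementwise accesses:

    /* Compile for amdgcn with -O3.  Each iteration touches two doubles at a
       runtime stride, i.e. group_size == 2 and single_element_p == false;
       before this change that combination always fell back to elementwise
       loads and stores.  */

    void
    scale_pairs (double *x, long stride, long n)
    {
      for (long i = 0; i < n; i++)
        {
          x[i * stride] *= 2.0;
          x[i * stride + 1] *= 2.0;
        }
    }

Whether a gather/scatter is actually emitted still depends on vect_check_gather_scatter finding a usable internal function and on the usual costing, so treat this only as the shape of loop the change is aimed at.
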