From a5e78ee60cd54dcceb9e7cfa42edd0c29c280f5c Mon Sep 17 00:00:00 2001 From: Bin Cheng Date: Mon, 9 Aug 2021 17:21:03 +0800 Subject: aarch64: Expand % correctly according to mode iterator Pattern "*extend2_aarch64" is duplicated from the corresponding zero_extend pattern, however % needs to be expanded according to its mode iterator because the smov instruction is different to umov. 2021-08-09 Bin Cheng gcc/ * config/aarch64/aarch64.md (*extend2_aarch64): Use %0. --- gcc/config/aarch64/aarch64.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index eb8ccd4..7085cd4 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1880,7 +1880,7 @@ "@ sxt\t%0, %w1 ldrs\t%0, %1 - smov\t%w0, %1.[0]" + smov\t%0, %1.[0]" [(set_attr "type" "extend,load_4,neon_to_gp") (set_attr "arch" "*,*,fp")] ) -- cgit v1.1 From 527a1cf32c27a3fbeaf6be7596241570d864cc4c Mon Sep 17 00:00:00 2001 From: Tobias Burnus Date: Mon, 9 Aug 2021 12:35:23 +0200 Subject: testsuite/lib/gfortran.exp: Add -I for ISO*.h [PR101305, PR101660] This patch adds -I$specdir/libgfortran to GFORTRAN_UNDER_TEST, when set by proc gfortran_init. As the $specdir depends on the multilib setting, it has to be re-set for a different multilib; hence, we track whether a previous call to gfortran_init set that var or whether it was set differently. gcc/testsuite/ PR libfortran/101305 PR fortran/101660 * lib/gfortran.exp (gfortran_init): Add -I $specdir/libgfortran to GFORTRAN_UNDER_TEST; update it when set by previous gfortran_init call. * gfortran.dg/ISO_Fortran_binding_1.c: Use <...> not "..." for ISO_Fortran_binding.h's #include. * gfortran.dg/ISO_Fortran_binding_10.c: Likewise. * gfortran.dg/ISO_Fortran_binding_11.c: Likewise. * gfortran.dg/ISO_Fortran_binding_12.c: Likewise. * gfortran.dg/ISO_Fortran_binding_15.c: Likewise. * gfortran.dg/ISO_Fortran_binding_16.c: Likewise. 
* gfortran.dg/ISO_Fortran_binding_17.c: Likewise. * gfortran.dg/ISO_Fortran_binding_18.c: Likewise. * gfortran.dg/ISO_Fortran_binding_3.c: Likewise. * gfortran.dg/ISO_Fortran_binding_5.c: Likewise. * gfortran.dg/ISO_Fortran_binding_6.c: Likewise. * gfortran.dg/ISO_Fortran_binding_7.c: Likewise. * gfortran.dg/ISO_Fortran_binding_8.c: Likewise. * gfortran.dg/ISO_Fortran_binding_9.c: Likewise. * gfortran.dg/PR94327.c: Likewise. * gfortran.dg/PR94331.c: Likewise. * gfortran.dg/bind_c_array_params_3_aux.c: Likewise. * gfortran.dg/iso_fortran_binding_uint8_array_driver.c: Likewise. * gfortran.dg/pr93524.c: Likewise. --- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_1.c | 2 +- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_10.c | 2 +- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_11.c | 2 +- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_12.c | 2 +- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_15.c | 2 +- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_16.c | 2 +- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_17.c | 2 +- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_18.c | 2 +- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_3.c | 2 +- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_5.c | 2 +- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_6.c | 2 +- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_7.c | 2 +- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_8.c | 2 +- gcc/testsuite/gfortran.dg/ISO_Fortran_binding_9.c | 2 +- gcc/testsuite/gfortran.dg/PR94327.c | 2 +- gcc/testsuite/gfortran.dg/PR94331.c | 2 +- gcc/testsuite/gfortran.dg/bind_c_array_params_3_aux.c | 2 +- .../gfortran.dg/iso_fortran_binding_uint8_array_driver.c | 2 +- gcc/testsuite/gfortran.dg/pr93524.c | 2 +- gcc/testsuite/lib/gfortran.exp | 10 ++++++++-- 20 files changed, 27 insertions(+), 21 deletions(-) (limited to 'gcc') diff --git a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_1.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_1.c index bb56ca0..d0d036a 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_1.c 
+++ b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_1.c @@ -1,6 +1,6 @@ /* Test F2008 18.5: ISO_Fortran_binding.h functions. */ -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> #include #include #include diff --git a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_10.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_10.c index c3954e4..91222ff 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_10.c +++ b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_10.c @@ -2,7 +2,7 @@ /* Contributed by Reinhold Bader */ -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> #include #include #include diff --git a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_11.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_11.c index c2d4e11..e013011 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_11.c +++ b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_11.c @@ -5,7 +5,7 @@ Contributed by Reinhold Bader #include */ #include #include #include -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> typedef struct { diff --git a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_12.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_12.c index 078c5de..0a41576 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_12.c +++ b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_12.c @@ -2,7 +2,7 @@ #include #include -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> /* Contributed by Reinhold Bader */ diff --git a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_15.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_15.c index 622f2de..fc70da4 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_15.c +++ b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_15.c @@ -4,7 +4,7 @@ #include #include -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> // Prototype for Fortran functions extern void Fsub(CFI_cdesc_t *); diff --git a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_16.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_16.c index 50b92ec..915b6e7 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_16.c +++ 
b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_16.c @@ -1,6 +1,6 @@ /* Test the fix for PR92142. */ -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> #include diff --git a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_17.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_17.c index b0893cc..fa75268 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_17.c +++ b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_17.c @@ -2,7 +2,7 @@ #include #include -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> void Csub(const CFI_cdesc_t *, size_t, CFI_index_t invalid); diff --git a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_18.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_18.c index ef40134..5a3952c 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_18.c +++ b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_18.c @@ -1,6 +1,6 @@ #include #include -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> extern int do_loop(CFI_cdesc_t* array); diff --git a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_3.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_3.c index 9f35b0d..33d1bc3 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_3.c +++ b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_3.c @@ -1,4 +1,4 @@ -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> #include #include diff --git a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_5.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_5.c index 116f548..b18a899 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_5.c +++ b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_5.c @@ -4,7 +4,7 @@ #include #include -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> typedef struct { int i; diff --git a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_6.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_6.c index 704b27c..c7981c5 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_6.c +++ b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_6.c @@ -4,7 +4,7 @@ #include #include -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> #define DEBUG 0 diff --git 
a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_7.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_7.c index 26b4ab5..8162451 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_7.c +++ b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_7.c @@ -2,7 +2,7 @@ /* Contributed by Reinhold Bader */ -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> #include #include #include diff --git a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_8.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_8.c index a0d1bdc..d3dce3a 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_8.c +++ b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_8.c @@ -2,7 +2,7 @@ /* Contributed by Reinhold Bader */ -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> #include float Cxgl[] = { 1.1, 2.3, 5.1, 4.2 }; diff --git a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_9.c b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_9.c index 632604f..cb17077 100644 --- a/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_9.c +++ b/gcc/testsuite/gfortran.dg/ISO_Fortran_binding_9.c @@ -2,7 +2,7 @@ /* Contributed by Gilles Gouaillardet */ -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> #include int cdesc_c(CFI_cdesc_t* x, long *expected) diff --git a/gcc/testsuite/gfortran.dg/PR94327.c b/gcc/testsuite/gfortran.dg/PR94327.c index 4ce408d..9d22681 100644 --- a/gcc/testsuite/gfortran.dg/PR94327.c +++ b/gcc/testsuite/gfortran.dg/PR94327.c @@ -4,7 +4,7 @@ #include #include -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> bool c_vrfy (const CFI_cdesc_t *restrict); diff --git a/gcc/testsuite/gfortran.dg/PR94331.c b/gcc/testsuite/gfortran.dg/PR94331.c index 2fbfe0e..df571c7 100644 --- a/gcc/testsuite/gfortran.dg/PR94331.c +++ b/gcc/testsuite/gfortran.dg/PR94331.c @@ -4,7 +4,7 @@ #include #include -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> bool c_vrfy (const CFI_cdesc_t *restrict); diff --git a/gcc/testsuite/gfortran.dg/bind_c_array_params_3_aux.c b/gcc/testsuite/gfortran.dg/bind_c_array_params_3_aux.c index 5176d8b..4594185 100644 --- 
a/gcc/testsuite/gfortran.dg/bind_c_array_params_3_aux.c +++ b/gcc/testsuite/gfortran.dg/bind_c_array_params_3_aux.c @@ -5,7 +5,7 @@ #include #include -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> void arr_set_c(CFI_cdesc_t*); diff --git a/gcc/testsuite/gfortran.dg/iso_fortran_binding_uint8_array_driver.c b/gcc/testsuite/gfortran.dg/iso_fortran_binding_uint8_array_driver.c index bfd567b..9c2b5fb 100644 --- a/gcc/testsuite/gfortran.dg/iso_fortran_binding_uint8_array_driver.c +++ b/gcc/testsuite/gfortran.dg/iso_fortran_binding_uint8_array_driver.c @@ -1,7 +1,7 @@ #include #include #include -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> extern void fsub(CFI_cdesc_t *); diff --git a/gcc/testsuite/gfortran.dg/pr93524.c b/gcc/testsuite/gfortran.dg/pr93524.c index ba40d00..8a6c066 100644 --- a/gcc/testsuite/gfortran.dg/pr93524.c +++ b/gcc/testsuite/gfortran.dg/pr93524.c @@ -2,7 +2,7 @@ sm incorrectly for dimensions > 2. */ #include // For size_t -#include "ISO_Fortran_binding.h" +#include <ISO_Fortran_binding.h> void my_fortran_sub_1 (CFI_cdesc_t *dv); void my_fortran_sub_2 (CFI_cdesc_t *dv); diff --git a/gcc/testsuite/lib/gfortran.exp b/gcc/testsuite/lib/gfortran.exp index 1e7da11..cae6738 100644 --- a/gcc/testsuite/lib/gfortran.exp +++ b/gcc/testsuite/lib/gfortran.exp @@ -151,6 +151,7 @@ proc gfortran_init { args } { global gcc_warning_prefix global gcc_error_prefix global TEST_ALWAYS_FLAGS + global gfortran_init_set_GFORTRAN_UNDER_TEST # We set LC_ALL and LANG to C so that we get the same error messages as expected. setenv LC_ALL C @@ -166,7 +167,11 @@ proc gfortran_init { args } { setenv LANG C.ASCII } - if ![info exists GFORTRAN_UNDER_TEST] then { + # GFORTRAN_UNDER_TEST as set below contains $specpath, which depends on + # the used multilib config. Thus, its value may need to be reset; + # that's tracked via gfortran_init_set_GFORTRAN_UNDER_TEST. 
+ if { ![info exists GFORTRAN_UNDER_TEST] + || [info exists gfortran_init_set_GFORTRAN_UNDER_TEST] } then { if [info exists TOOL_EXECUTABLE] { set GFORTRAN_UNDER_TEST $TOOL_EXECUTABLE } else { @@ -178,7 +183,8 @@ proc gfortran_init { args } { } else { set specpath [get_multilibs] } - set GFORTRAN_UNDER_TEST [findfile $base_dir/../../gfortran "$base_dir/../../gfortran -B$base_dir/../../ -B$specpath/libgfortran/" [findfile $base_dir/gfortran "$base_dir/gfortran -B$base_dir/" [transform gfortran]]] + set gfortran_init_set_GFORTRAN_UNDER_TEST 1 + set GFORTRAN_UNDER_TEST [findfile $base_dir/../../gfortran "$base_dir/../../gfortran -B$base_dir/../../ -B$specpath/libgfortran/ -I$specpath/libgfortran" [findfile $base_dir/gfortran "$base_dir/gfortran -B$base_dir/" [transform gfortran]]] } } } -- cgit v1.1 From 848bcda52d7431c3be9c33c9803928ae7c54583a Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Mon, 9 Aug 2021 12:02:53 +0100 Subject: Improve handling of unknown sign bit in CCP. This middle-end patch implements several related improvements to tree-ssa's conditional (bit) constant propagation pass. The current code handling ordered comparisons contains the comment "If the most significant bits are not known we know nothing" which is not entirely true [this test even prevents this pass understanding these comparisons always have a zero or one result]. This patch introduces a new value_mask_to_min_max helper function, that understands the different semantics of the most significant bit on signed vs. unsigned values. This allows us to generalize ordered comparisons, GE_EXPR, GT_EXPR, LE_EXPR and LT_EXPR, where the code is tweaked to correctly handle the potential equal cases. Then finally support is added for the related tree codes MIN_EXPR, MAX_EXPR, ABS_EXPR and ABSU_EXPR. Regression testing revealed three test cases in the testsuite that were checking for specific optimizations that are now being performed earlier than expected. 
These tests can continue to check their original transformations by explicitly adding -fno-tree-ccp to their dg-options (some already specify -fno-ipa-vrp or -fno-tree-forwprop for the same reason). 2021-08-09 Roger Sayle gcc/ChangeLog * tree-ssa-ccp.c (value_mask_to_min_max): Helper function to determine the upper and lower bounds from a mask-value pair. (bit_value_unop) [ABS_EXPR, ABSU_EXPR]: Add support for absolute value and unsigned absolute value expressions. (bit_value_binop): Initialize *VAL's precision. [LT_EXPR, LE_EXPR]: Use value_mask_to_min_max to determine upper and lower bounds of operands. Add LE_EXPR/GE_EXPR support when the operands are unknown but potentially equal. [MIN_EXPR, MAX_EXPR]: Support minimum/maximum expressions. gcc/testsuite/ChangeLog * gcc.dg/pr68217.c: Add -fno-tree-ccp option. * gcc.dg/tree-ssa/vrp24.c: Add -fno-tree-ccp option. * g++.dg/ipa/pure-const-3.C: Add -fno-tree-ccp option. --- gcc/testsuite/g++.dg/ipa/pure-const-3.C | 2 +- gcc/testsuite/gcc.dg/pr68217.c | 2 +- gcc/testsuite/gcc.dg/tree-ssa/vrp24.c | 2 +- gcc/tree-ssa-ccp.c | 118 ++++++++++++++++++++++++++++---- 4 files changed, 107 insertions(+), 17 deletions(-) (limited to 'gcc') diff --git a/gcc/testsuite/g++.dg/ipa/pure-const-3.C b/gcc/testsuite/g++.dg/ipa/pure-const-3.C index 4cf9a6a..172a36b 100644 --- a/gcc/testsuite/g++.dg/ipa/pure-const-3.C +++ b/gcc/testsuite/g++.dg/ipa/pure-const-3.C @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fno-ipa-vrp -fdump-tree-optimized" } */ +/* { dg-options "-O2 -fno-ipa-vrp -fdump-tree-optimized -fno-tree-ccp" } */ int *ptr; static int barvar; static int b(int a); diff --git a/gcc/testsuite/gcc.dg/pr68217.c b/gcc/testsuite/gcc.dg/pr68217.c index c5b0d1f..eb4f15e 100644 --- a/gcc/testsuite/gcc.dg/pr68217.c +++ b/gcc/testsuite/gcc.dg/pr68217.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdisable-tree-evrp -fdump-tree-vrp1" } */ +/* { dg-options "-O2 -fdisable-tree-evrp -fdump-tree-vrp1 
-fno-tree-ccp" } */ int foo (void) { diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vrp24.c b/gcc/testsuite/gcc.dg/tree-ssa/vrp24.c index dfe44b3..91015da 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/vrp24.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/vrp24.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fno-tree-forwprop -fdump-tree-evrp-details -fdump-tree-optimized" } */ +/* { dg-options "-O2 -fno-tree-forwprop -fdump-tree-evrp-details -fdump-tree-optimized -fno-tree-ccp" } */ struct rtx_def; diff --git a/gcc/tree-ssa-ccp.c b/gcc/tree-ssa-ccp.c index 9ce6214..003c9c2 100644 --- a/gcc/tree-ssa-ccp.c +++ b/gcc/tree-ssa-ccp.c @@ -1293,6 +1293,28 @@ ccp_fold (gimple *stmt) } } +/* Determine the minimum and maximum values, *MIN and *MAX respectively, + represented by the mask pair VAL and MASK with signedness SGN and + precision PRECISION. */ + +void +value_mask_to_min_max (widest_int *min, widest_int *max, + const widest_int &val, const widest_int &mask, + signop sgn, int precision) +{ + *min = wi::bit_and_not (val, mask); + *max = val | mask; + if (sgn == SIGNED && wi::neg_p (mask)) + { + widest_int sign_bit = wi::lshift (1, precision - 1); + *min ^= sign_bit; + *max ^= sign_bit; + /* MAX is zero extended, and MIN is sign extended. */ + *min = wi::ext (*min, precision, sgn); + *max = wi::ext (*max, precision, sgn); + } +} + /* Apply the operation CODE in type TYPE to the value, mask pair RVAL and RMASK representing a value of type RTYPE and set the value, mask pair *VAL and *MASK to the result. */ @@ -1334,6 +1356,33 @@ bit_value_unop (enum tree_code code, signop type_sgn, int type_precision, break; } + case ABS_EXPR: + case ABSU_EXPR: + if (wi::sext (rmask, rtype_precision) == -1) + *mask = -1; + else if (wi::neg_p (rmask)) + { + /* Result is either rval or -rval. 
*/ + widest_int temv, temm; + bit_value_unop (NEGATE_EXPR, rtype_sgn, rtype_precision, &temv, + &temm, type_sgn, type_precision, rval, rmask); + temm |= (rmask | (rval ^ temv)); + /* Extend the result. */ + *mask = wi::ext (temm, type_precision, type_sgn); + *val = wi::ext (temv, type_precision, type_sgn); + } + else if (wi::neg_p (rval)) + { + bit_value_unop (NEGATE_EXPR, type_sgn, type_precision, val, mask, + type_sgn, type_precision, rval, rmask); + } + else + { + *mask = rmask; + *val = rval; + } + break; + default: *mask = -1; break; @@ -1357,6 +1406,8 @@ bit_value_binop (enum tree_code code, signop sgn, int width, /* Assume we'll get a constant result. Use an initial non varying value, we fall back to varying in the end if necessary. */ *mask = -1; + /* Ensure that VAL is initialized (to any value). */ + *val = 0; switch (code) { @@ -1527,6 +1578,7 @@ bit_value_binop (enum tree_code code, signop sgn, int width, case LT_EXPR: case LE_EXPR: { + widest_int min1, max1, min2, max2; int minmax, maxmin; const widest_int &o1val = swap_p ? r2val : r1val; @@ -1534,26 +1586,21 @@ bit_value_binop (enum tree_code code, signop sgn, int width, const widest_int &o2val = swap_p ? r1val : r2val; const widest_int &o2mask = swap_p ? r1mask : r2mask; - /* If the most significant bits are not known we know nothing. */ - if (wi::neg_p (o1mask) || wi::neg_p (o2mask)) - break; + value_mask_to_min_max (&min1, &max1, o1val, o1mask, + r1type_sgn, r1type_precision); + value_mask_to_min_max (&min2, &max2, o2val, o2mask, + r1type_sgn, r1type_precision); /* For comparisons the signedness is in the comparison operands. */ - sgn = r1type_sgn; - - /* If we know the most significant bits we know the values - value ranges by means of treating varying bits as zero - or one. Do a cross comparison of the max/min pairs. 
*/ - maxmin = wi::cmp (o1val | o1mask, - wi::bit_and_not (o2val, o2mask), sgn); - minmax = wi::cmp (wi::bit_and_not (o1val, o1mask), - o2val | o2mask, sgn); - if (maxmin < 0) /* o1 is less than o2. */ + /* Do a cross comparison of the max/min pairs. */ + maxmin = wi::cmp (max1, min2, r1type_sgn); + minmax = wi::cmp (min1, max2, r1type_sgn); + if (maxmin < (code == LE_EXPR ? 1: 0)) /* o1 < or <= o2. */ { *mask = 0; *val = 1; } - else if (minmax > 0) /* o1 is not less or equal to o2. */ + else if (minmax > (code == LT_EXPR ? -1 : 0)) /* o1 >= or > o2. */ { *mask = 0; *val = 0; @@ -1574,6 +1621,49 @@ bit_value_binop (enum tree_code code, signop sgn, int width, break; } + case MIN_EXPR: + case MAX_EXPR: + { + widest_int min1, max1, min2, max2; + + value_mask_to_min_max (&min1, &max1, r1val, r1mask, sgn, width); + value_mask_to_min_max (&min2, &max2, r2val, r2mask, sgn, width); + + if (wi::cmp (max1, min2, sgn) <= 0) /* r1 is less than r2. */ + { + if (code == MIN_EXPR) + { + *mask = r1mask; + *val = r1val; + } + else + { + *mask = r2mask; + *val = r2val; + } + } + else if (wi::cmp (min1, max2, sgn) >= 0) /* r2 is less than r1. */ + { + if (code == MIN_EXPR) + { + *mask = r2mask; + *val = r2val; + } + else + { + *mask = r1mask; + *val = r1val; + } + } + else + { + /* The result is either r1 or r2. */ + *mask = r1mask | r2mask | (r1val ^ r2val); + *val = r1val; + } + break; + } + default:; } } -- cgit v1.1 From 67b8443bd1f6cfb194eed6043a3acca4369fd09c Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Wed, 4 Aug 2021 14:01:56 +0200 Subject: [documentation] Fix GTY header file example Fix-up for CVS 'gcc/doc/gty.texi' r1.6 (Subversion r55857, Git commit cba57c9d40057fa78efc9a404ab4ae7101a59dcb) "Minor doc updates" gcc/ * doc/gty.texi (Files): Fix GTY header file example. 
--- gcc/doc/gty.texi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/doc/gty.texi b/gcc/doc/gty.texi index aaf97ae..b90931d 100644 --- a/gcc/doc/gty.texi +++ b/gcc/doc/gty.texi @@ -628,7 +628,7 @@ header file that should be included in the source file you just changed. The file will be called @file{gt-@var{path}.h} where @var{path} is the pathname relative to the @file{gcc} directory with slashes replaced by @verb{|-|}, so for example the header file to be included in -@file{cp/parser.c} is called @file{gt-cp-parser.c}. The +@file{cp/parser.c} is called @file{gt-cp-parser.h}. The generated header file should be included after everything else in the source file. Don't forget to mention this file as a dependency in the @file{Makefile}! -- cgit v1.1 From 7cc85851bc7981b999f9ebadcf3f4b2c34c0ce07 Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Wed, 4 Aug 2021 13:52:58 +0200 Subject: [documentation] No need anymore to "mention ['gt-*.h' file] as a dependency in the 'Makefile'" ... as of r202907 (Git commit b6541edc52ed57b6e47150396356d3080ba81034) "remove explicit dependencies". gcc/ * doc/gty.texi (Files): Update. --- gcc/doc/gty.texi | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'gcc') diff --git a/gcc/doc/gty.texi b/gcc/doc/gty.texi index b90931d..cf070c1 100644 --- a/gcc/doc/gty.texi +++ b/gcc/doc/gty.texi @@ -630,8 +630,7 @@ pathname relative to the @file{gcc} directory with slashes replaced by @verb{|-|}, so for example the header file to be included in @file{cp/parser.c} is called @file{gt-cp-parser.h}. The generated header file should be included after everything else in the -source file. Don't forget to mention this file as a dependency in the -@file{Makefile}! +source file. 
@end enumerate -- cgit v1.1 From 06870af3e48daf523a973981f053ee5c6f44c871 Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Mon, 2 Aug 2021 18:34:47 +0200 Subject: [OpenACC] Clean up unused 'BUILT_IN_ACC_GET_DEVICE_TYPE' Unused as of r229767 (Git commit e50146711b7200e8f822c6d8239430c682b76e4f) "OpenACC reductions". gcc/ * omp-builtins.def (BUILT_IN_ACC_GET_DEVICE_TYPE): Remove. --- gcc/omp-builtins.def | 2 -- 1 file changed, 2 deletions(-) (limited to 'gcc') diff --git a/gcc/omp-builtins.def b/gcc/omp-builtins.def index 97964f8..4a7e7ba 100644 --- a/gcc/omp-builtins.def +++ b/gcc/omp-builtins.def @@ -29,8 +29,6 @@ along with GCC; see the file COPYING3. If not see /* The reason why they aren't in gcc/builtins.def is that the Fortran front end doesn't source those. */ -DEF_GOACC_BUILTIN (BUILT_IN_ACC_GET_DEVICE_TYPE, "acc_get_device_type", - BT_FN_INT, ATTR_NOTHROW_LIST) DEF_GOACC_BUILTIN (BUILT_IN_GOACC_DATA_START, "GOACC_data_start", BT_FN_VOID_INT_SIZE_PTR_PTR_PTR, ATTR_NOTHROW_LIST) DEF_GOACC_BUILTIN (BUILT_IN_GOACC_DATA_END, "GOACC_data_end", -- cgit v1.1 From c5230519305946338ddc7107ce45c740812142b4 Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Tue, 3 Aug 2021 14:59:56 +0200 Subject: Sanity check that 'Init' doesn't appear without 'Var' in '*.opt' files ... as that doesn't make sense. @item Init(@var{value}) The variable specified by the @code{Var} property should be statically initialized to @var{value}. [...] gcc/ * optc-gen.awk: Sanity check that 'Init' doesn't appear without 'Var'. 
--- gcc/optc-gen.awk | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'gcc') diff --git a/gcc/optc-gen.awk b/gcc/optc-gen.awk index 880ac77..77e598e 100644 --- a/gcc/optc-gen.awk +++ b/gcc/optc-gen.awk @@ -195,10 +195,14 @@ for (i = 0; i < n_extra_vars; i++) { } for (i = 0; i < n_opts; i++) { name = var_name(flags[i]); - if (name == "") + init = opt_args("Init", flags[i]) + + if (name == "") { + if (init != "") + print "#error " opts[i] " must specify Var to use Init" continue; + } - init = opt_args("Init", flags[i]) if (init != "") { if (name in var_init && var_init[name] != init) print "#error multiple initializers for " name -- cgit v1.1 From 2a700fb8ea95927b02146db2b3338b1f9b868196 Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Mon, 2 Aug 2021 18:33:50 +0200 Subject: Don't consider '-foffload-abi' in 'DEF_GOACC_BUILTIN', 'DEF_GOMP_BUILTIN' Since Tom's PR64707 commit r220037 (Git commit 1506ae0e1e865fb7a42fc37a47f1799b71f21c53) "Make fopenmp an LTO option" as well as PR64672 commit r220038 (Git commit a0c88d0629a33161add8d5bc083f1e59f3f756f7) "Make fopenacc an LTO option", we're now actually passing '-fopenacc'/'-fopenmp' to the 'mkoffload's, which will pass these on to the offload compilers. gcc/ * builtins.def (DEF_GOACC_BUILTIN, DEF_GOMP_BUILTIN): Don't consider '-foffload-abi'. * common.opt (-foffload-abi): Remove 'Var', 'Init'. * opts.c (common_handle_option) <-foffload-abi> [ACCEL_COMPILER]: Ignore. --- gcc/builtins.def | 8 ++------ gcc/common.opt | 2 +- gcc/opts.c | 6 ++++-- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'gcc') diff --git a/gcc/builtins.def b/gcc/builtins.def index ec556df..45a09b4 100644 --- a/gcc/builtins.def +++ b/gcc/builtins.def @@ -205,14 +205,11 @@ along with GCC; see the file COPYING3. If not see /* Builtin used by the implementation of OpenACC and OpenMP. Few of these are actually implemented in the compiler; most are in libgomp. 
*/ -/* These builtins also need to be enabled in offloading compilers invoked from - mkoffload; for that purpose, we're checking the -foffload-abi flag here. */ #undef DEF_GOACC_BUILTIN #define DEF_GOACC_BUILTIN(ENUM, NAME, TYPE, ATTRS) \ DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE, \ false, true, true, ATTRS, false, \ - (flag_openacc \ - || flag_offload_abi != OFFLOAD_ABI_UNSET)) + flag_openacc) #undef DEF_GOACC_BUILTIN_COMPILER #define DEF_GOACC_BUILTIN_COMPILER(ENUM, NAME, TYPE, ATTRS) \ DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE, \ @@ -227,8 +224,7 @@ along with GCC; see the file COPYING3. If not see false, true, true, ATTRS, false, \ (flag_openacc \ || flag_openmp \ - || flag_tree_parallelize_loops > 1 \ - || flag_offload_abi != OFFLOAD_ABI_UNSET)) + || flag_tree_parallelize_loops > 1)) /* Builtin used by the implementation of GNU TM. These functions are mapped to the actual implementation of the STM library. */ diff --git a/gcc/common.opt b/gcc/common.opt index d9da113..ed8ab5f 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -2112,7 +2112,7 @@ Common Driver Joined MissingArgError(options or targets=options missing after %q -foffload-options== Specify options for the offloading targets. foffload-abi= -Common Joined RejectNegative Enum(offload_abi) Var(flag_offload_abi) Init(OFFLOAD_ABI_UNSET) +Common Joined RejectNegative Enum(offload_abi) -foffload-abi=[lp64|ilp32] Set the ABI to use in an offload compiler. Enum diff --git a/gcc/opts.c b/gcc/opts.c index 93366e6..1f52e11 100644 --- a/gcc/opts.c +++ b/gcc/opts.c @@ -2737,12 +2737,14 @@ common_handle_option (struct gcc_options *opts, /* Deferred. */ break; -#ifndef ACCEL_COMPILER case OPT_foffload_abi_: +#ifdef ACCEL_COMPILER + /* Handled in the 'mkoffload's. 
*/ +#else error_at (loc, "%<-foffload-abi%> option can be specified only for " "offload compiler"); - break; #endif + break; case OPT_fpack_struct_: if (value <= 0 || (value & (value - 1)) || value > 16) -- cgit v1.1 From 0095afa82a34cdf59a40112b621b348e0087ddb8 Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Wed, 4 Aug 2021 13:41:22 +0200 Subject: Remove 'gcc/omp-offload.c' from 'GTFILES' Given that it doesn't contain any 'GTY' markers, no 'gcc/gt-omp-offload.h' file gets generated (and '#include'd anywhere). Small fix-up for r243673 (Git commit 629b3d75c8c5a244d891a9c292bca6912d4b0dd9) "Split omp-low into multiple files". gcc/ * Makefile.in (GTFILES): Remove '$(srcdir)/omp-offload.c'. --- gcc/Makefile.in | 1 - 1 file changed, 1 deletion(-) (limited to 'gcc') diff --git a/gcc/Makefile.in b/gcc/Makefile.in index c0f6e0a..8baa3b7 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -2693,7 +2693,6 @@ GTFILES = $(CPPLIB_H) $(srcdir)/input.h $(srcdir)/coretypes.h \ $(srcdir)/tree-ssa-operands.h \ $(srcdir)/tree-profile.c $(srcdir)/tree-nested.c \ $(srcdir)/omp-offload.h \ - $(srcdir)/omp-offload.c \ $(srcdir)/omp-general.c \ $(srcdir)/omp-low.c \ $(srcdir)/targhooks.c $(out_file) $(srcdir)/passes.c \ -- cgit v1.1 From e2e0b85c1e7cb53fd720df0d09278e3d485c733e Mon Sep 17 00:00:00 2001 From: Tejas Belagod Date: Mon, 9 Aug 2021 11:33:30 +0100 Subject: PR101609: Use the correct iterator for AArch64 vector right shift pattern Loops containing long long shifts fail to vectorize due to the vectorizer not being able to recognize long long right shifts. This is due to a bug in the iterator used for the vashr and vlshr patterns in aarch64-simd.md. 2021-08-09 Tejas Belagod gcc/ChangeLog PR target/101609 * config/aarch64/aarch64-simd.md (vlshr3, vashr3): Use the right iterator. gcc/testsuite/ChangeLog * gcc.target/aarch64/vect-shr-reg.c: New testcase. * gcc.target/aarch64/vect-shr-reg-run.c: Likewise. 
--- gcc/config/aarch64/aarch64-simd.md | 15 +++--- .../gcc.target/aarch64/vect-shr-reg-run.c | 53 ++++++++++++++++++++++ gcc/testsuite/gcc.target/aarch64/vect-shr-reg.c | 30 ++++++++++++ 3 files changed, 89 insertions(+), 9 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-shr-reg-run.c create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-shr-reg.c (limited to 'gcc') diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index c5638d0..48eddf6 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1299,13 +1299,10 @@ DONE; }) -;; Using mode VDQ_BHSI as there is no V2DImode neg! -;; Negating individual lanes most certainly offsets the -;; gain from vectorization. (define_expand "vashr3" - [(match_operand:VDQ_BHSI 0 "register_operand") - (match_operand:VDQ_BHSI 1 "register_operand") - (match_operand:VDQ_BHSI 2 "register_operand")] + [(match_operand:VDQ_I 0 "register_operand") + (match_operand:VDQ_I 1 "register_operand") + (match_operand:VDQ_I 2 "register_operand")] "TARGET_SIMD" { rtx neg = gen_reg_rtx (mode); @@ -1333,9 +1330,9 @@ ) (define_expand "vlshr3" - [(match_operand:VDQ_BHSI 0 "register_operand") - (match_operand:VDQ_BHSI 1 "register_operand") - (match_operand:VDQ_BHSI 2 "register_operand")] + [(match_operand:VDQ_I 0 "register_operand") + (match_operand:VDQ_I 1 "register_operand") + (match_operand:VDQ_I 2 "register_operand")] "TARGET_SIMD" { rtx neg = gen_reg_rtx (mode); diff --git a/gcc/testsuite/gcc.target/aarch64/vect-shr-reg-run.c b/gcc/testsuite/gcc.target/aarch64/vect-shr-reg-run.c new file mode 100644 index 0000000..3190448 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-shr-reg-run.c @@ -0,0 +1,53 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -march=armv8.2-a" } */ + +#include "vect-shr-reg.c" + +int +main(void) +{ + int64_t a[16]; + int64_t b[16]; + int64_t c[17]; + + uint64_t ua[16]; + uint64_t ub[16]; + uint64_t uc[17]; + + int64_t res_a[16]; + 
uint64_t res_ua[16]; + + int i; + + /* Set up inputs. */ + for (i = 0; i < 16; i++) + { + b[i] = -2; + c[i] = 34; + ub[i] = 0xffffffffffffffff; + uc[i] = 52; + } + + /* Set up reference values. */ + for (i = 0; i < 16; i++) + { + res_a[i] = -1LL; + res_ua[i] = 0x0fffLL; + } + + /* Do the shifts. */ + f (ua, ub, uc); + g (a, b, c); + + /* Compare outputs against reference values. */ + for (i = 0; i < 16; i++) + { + if (a[i] != res_a[i]) + __builtin_abort (); + + if (ua[i] != res_ua[i]) + __builtin_abort (); + } + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/vect-shr-reg.c b/gcc/testsuite/gcc.target/aarch64/vect-shr-reg.c new file mode 100644 index 0000000..5736daf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-shr-reg.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=armv8.2-a" } */ + +#include +#include + +#pragma GCC target "+nosve" + +int __attribute__((noinline)) +f(uint64_t *__restrict a, uint64_t *__restrict b, uint64_t *__restrict c) +{ + int i; + + for (i = 0; i < 16; i++) + a[i] = b[i] >> c[i]; +} + + +int __attribute__((noinline)) +g(int64_t *__restrict a, int64_t *__restrict b, int64_t *__restrict c) +{ + int i; + + for (i = 0; i < 16; i++) + a[i] = b[i] >> c[i]; +} + +/* { dg-final { scan-assembler "neg\\tv" } } */ +/* { dg-final { scan-assembler "ushl\\tv" } } */ +/* { dg-final { scan-assembler "sshl\\tv" } } */ -- cgit v1.1 From e2a58ed6dc5293602d0d168475109caa81ad0f0d Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Tue, 2 Mar 2021 04:20:11 -0800 Subject: openacc: Middle-end worker-partitioning support This patch implements worker-partitioning support in the middle end, by rewriting gimple. The OpenACC execution model requires that code can run in either "worker single" mode where only a single worker per gang is active, or "worker partitioned" mode, where multiple workers per gang are active. 
This means we need to do something equivalent to spawning additional workers when transitioning from worker-single to worker-partitioned mode. However, GPUs typically fix the number of threads of invoked kernels at launch time, so we need to do something with the "extra" threads when they are not wanted. The scheme used is to conditionalise each basic block that executes in "worker single" mode for worker 0 only. Conditional branches are handled specially so "idle" (non-0) workers follow along with worker 0. On transitioning to "worker partitioned" mode, any variables modified by worker 0 are propagated to the other workers via GPU shared memory. Special care is taken for routine calls, writes through pointers, and so forth, as follows: - There are two types of function calls to consider in worker-single mode: "normal" calls to maths library routines, etc. are called from worker 0 only. OpenACC routines may contain worker-partitioned loops themselves, so are called from all workers, including "idle" ones. - SSA names set in worker-single mode, but used in worker-partitioned mode, are copied to shared memory in worker 0. Other workers retrieve the value from the appropriate shared-memory location after a barrier, and new phi nodes are introduced at the convergence point to resolve the worker 0/other worker copies of the value. - Local scalar variables (on the stack) also need special handling. We broadcast any variables that are written in the current worker-single block, and that are read in any worker-partitioned block. (This is believed to be safe, and is flow-insensitive to ease analysis.) - Local aggregates (arrays and composites) on the stack are *not* broadcast. Instead we force gimple stmts modifying elements/fields of local aggregates into fully-partitioned mode. The RHS of the assignment is a scalar, and is thus subject to broadcasting as above. - Writes through pointers may affect any local variable that has its address taken. 
We use points-to analysis to determine the set of potentially-affected variables for a given pointer indirection. We broadcast any such variable which is used in worker-partitioned mode, on a per-block basis for any block containing a write through a pointer. Some slides about the implementation (from 2018) are available at: https://jtb20.github.io/gcnworkers.pdf gcc/ * Makefile.in (OBJS): Add omp-oacc-neuter-broadcast.o. * doc/tm.texi.in (TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD): Add documentation hook. * doc/tm.texi: Regenerate. * omp-oacc-neuter-broadcast.cc: New file. * omp-builtins.def (BUILT_IN_GOACC_BARRIER) (BUILT_IN_GOACC_SINGLE_START, BUILT_IN_GOACC_SINGLE_COPY_START) (BUILT_IN_GOACC_SINGLE_COPY_END): New builtins. * passes.def (pass_omp_oacc_neuter_broadcast): Add pass. * target.def (goacc.create_worker_broadcast_record): Add target hook. * tree-pass.h (make_pass_omp_oacc_neuter_broadcast): Add prototype. * config/gcn/gcn-protos.h (gcn_goacc_adjust_propagation_record): Rename prototype to... (gcn_goacc_create_worker_broadcast_record): ... this. * config/gcn/gcn-tree.c (gcn_goacc_adjust_propagation_record): Rename function to... (gcn_goacc_create_worker_broadcast_record): ... this. * config/gcn/gcn.c (TARGET_GOACC_ADJUST_PROPAGATION_RECORD): Rename to... (TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD): ... this. 
Co-Authored-By: Nathan Sidwell (via 'gcc/config/nvptx/nvptx.c' master) Co-Authored-By: Kwok Cheung Yeung Co-Authored-By: Thomas Schwinge --- gcc/Makefile.in | 1 + gcc/config/gcn/gcn-protos.h | 5 +- gcc/config/gcn/gcn-tree.c | 58 +- gcc/config/gcn/gcn.c | 6 +- gcc/doc/tm.texi | 9 + gcc/doc/tm.texi.in | 2 + gcc/omp-builtins.def | 9 + gcc/omp-oacc-neuter-broadcast.cc | 1515 ++++++++++++++++++++++++++++++++++++++ gcc/passes.def | 1 + gcc/target.def | 11 + gcc/tree-pass.h | 1 + 11 files changed, 1584 insertions(+), 34 deletions(-) create mode 100644 gcc/omp-oacc-neuter-broadcast.cc (limited to 'gcc') diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 8baa3b7..6653e9e 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1513,6 +1513,7 @@ OBJS = \ omp-general.o \ omp-low.o \ omp-oacc-kernels-decompose.o \ + omp-oacc-neuter-broadcast.o \ omp-simd-clone.o \ opt-problem.o \ optabs.o \ diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h index 8bd0b43..5d62a84 100644 --- a/gcc/config/gcn/gcn-protos.h +++ b/gcc/config/gcn/gcn-protos.h @@ -38,9 +38,10 @@ extern rtx gcn_full_exec (); extern rtx gcn_full_exec_reg (); extern rtx gcn_gen_undef (machine_mode); extern bool gcn_global_address_p (rtx); -extern tree gcn_goacc_adjust_propagation_record (tree record_type, bool sender, - const char *name); extern tree gcn_goacc_adjust_private_decl (location_t, tree var, int level); +extern tree gcn_goacc_create_worker_broadcast_record (tree record_type, + bool sender, + const char *name); extern void gcn_goacc_reduction (gcall *call); extern bool gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg); diff --git a/gcc/config/gcn/gcn-tree.c b/gcc/config/gcn/gcn-tree.c index 1eb8882..f722d2d 100644 --- a/gcc/config/gcn/gcn-tree.c +++ b/gcc/config/gcn/gcn-tree.c @@ -548,35 +548,6 @@ gcn_goacc_reduction (gcall *call) } } -/* Implement TARGET_GOACC_ADJUST_PROPAGATION_RECORD. - - Tweak (worker) propagation record, e.g. to put it in shared memory. 
*/ - -tree -gcn_goacc_adjust_propagation_record (tree record_type, bool sender, - const char *name) -{ - tree type = record_type; - - TYPE_ADDR_SPACE (type) = ADDR_SPACE_LDS; - - if (!sender) - type = build_pointer_type (type); - - tree decl = create_tmp_var_raw (type, name); - - if (sender) - { - DECL_CONTEXT (decl) = NULL_TREE; - TREE_STATIC (decl) = 1; - } - - if (sender) - varpool_node::finalize_decl (decl); - - return decl; -} - tree gcn_goacc_adjust_private_decl (location_t, tree var, int level) { @@ -604,4 +575,33 @@ gcn_goacc_adjust_private_decl (location_t, tree var, int level) return var; } +/* Implement TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD. + + Create OpenACC worker state propagation record in shared memory. */ + +tree +gcn_goacc_create_worker_broadcast_record (tree record_type, bool sender, + const char *name) +{ + tree type = record_type; + + TYPE_ADDR_SPACE (type) = ADDR_SPACE_LDS; + + if (!sender) + type = build_pointer_type (type); + + tree decl = create_tmp_var_raw (type, name); + + if (sender) + { + DECL_CONTEXT (decl) = NULL_TREE; + TREE_STATIC (decl) = 1; + } + + if (sender) + varpool_node::finalize_decl (decl); + + return decl; +} + /* }}} */ diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index d25c4e5..87af5d1 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -6513,11 +6513,11 @@ gcn_dwarf_register_span (rtx rtl) #define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA #define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa -#undef TARGET_GOACC_ADJUST_PROPAGATION_RECORD -#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \ - gcn_goacc_adjust_propagation_record #undef TARGET_GOACC_ADJUST_PRIVATE_DECL #define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl +#undef TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD +#define TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD \ + gcn_goacc_create_worker_broadcast_record #undef TARGET_GOACC_FORK_JOIN #define 
TARGET_GOACC_FORK_JOIN gcn_fork_join #undef TARGET_GOACC_REDUCTION diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index cb01528..a30fdcb 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6409,6 +6409,15 @@ private variables at OpenACC device-lowering time using the @code{TARGET_GOACC_ADJUST_PRIVATE_DECL} target hook. @end deftypefn +@deftypefn {Target Hook} tree TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD (tree @var{rec}, bool @var{sender}, const char *@var{name}) +Create a record used to propagate local-variable state from an active +worker to other workers. A possible implementation might adjust the type +of REC to place the new variable in shared GPU memory. + +Presence of this target hook indicates that middle end neutering/broadcasting +be used. +@end deftypefn + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 4a522ae..611fc50 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4223,6 +4223,8 @@ address; but often a machine-dependent strategy can generate better code. 
@hook TARGET_GOACC_EXPAND_VAR_DECL +@hook TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses diff --git a/gcc/omp-builtins.def b/gcc/omp-builtins.def index 4a7e7ba..05b555c 100644 --- a/gcc/omp-builtins.def +++ b/gcc/omp-builtins.def @@ -59,6 +59,15 @@ DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_PARLEVEL_ID, "goacc_parlevel_id", DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_PARLEVEL_SIZE, "goacc_parlevel_size", BT_FN_INT_INT, ATTR_NOTHROW_LEAF_LIST) +DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_BARRIER, "GOACC_barrier", + BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST) +DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_SINGLE_START, "GOACC_single_start", + BT_FN_BOOL, ATTR_NOTHROW_LEAF_LIST) +DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_SINGLE_COPY_START, "GOACC_single_copy_start", + BT_FN_PTR, ATTR_NOTHROW_LEAF_LIST) +DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_SINGLE_COPY_END, "GOACC_single_copy_end", + BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST) + DEF_GOMP_BUILTIN (BUILT_IN_OMP_GET_THREAD_NUM, "omp_get_thread_num", BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_OMP_GET_NUM_THREADS, "omp_get_num_threads", diff --git a/gcc/omp-oacc-neuter-broadcast.cc b/gcc/omp-oacc-neuter-broadcast.cc new file mode 100644 index 0000000..0f6ba88 --- /dev/null +++ b/gcc/omp-oacc-neuter-broadcast.cc @@ -0,0 +1,1515 @@ +/* OpenACC worker partitioning via middle end neutering/broadcasting scheme + + Copyright (C) 2015-2021 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. 
+ + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "gimple.h" +#include "tree-pass.h" +#include "ssa.h" +#include "cgraph.h" +#include "pretty-print.h" +#include "fold-const.h" +#include "gimplify.h" +#include "gimple-iterator.h" +#include "gimple-walk.h" +#include "tree-inline.h" +#include "langhooks.h" +#include "omp-general.h" +#include "omp-low.h" +#include "gimple-pretty-print.h" +#include "cfghooks.h" +#include "insn-config.h" +#include "recog.h" +#include "internal-fn.h" +#include "bitmap.h" +#include "tree-nested.h" +#include "stor-layout.h" +#include "tree-ssa-threadupdate.h" +#include "tree-into-ssa.h" +#include "splay-tree.h" +#include "target.h" +#include "cfgloop.h" +#include "tree-cfg.h" +#include "omp-offload.h" +#include "attribs.h" + +/* Loop structure of the function. The entire function is described as + a NULL loop. */ + +struct parallel_g +{ + /* Parent parallel. */ + parallel_g *parent; + + /* Next sibling parallel. */ + parallel_g *next; + + /* First child parallel. */ + parallel_g *inner; + + /* Partitioning mask of the parallel. */ + unsigned mask; + + /* Partitioning used within inner parallels. */ + unsigned inner_mask; + + /* Location of parallel forked and join. The forked is the first + block in the parallel and the join is the first block after + the partition. */ + basic_block forked_block; + basic_block join_block; + + gimple *forked_stmt; + gimple *join_stmt; + + gimple *fork_stmt; + gimple *joining_stmt; + + /* Basic blocks in this parallel, but not in child parallels. The + FORKED and JOINING blocks are in the partition. The FORK and JOIN + blocks are not. 
*/ + auto_vec<basic_block> blocks; + + tree record_type; + tree sender_decl; + tree receiver_decl; + +public: + parallel_g (parallel_g *parent, unsigned mode); + ~parallel_g (); +}; + +/* Constructor links the new parallel into its parent's chain of + children. */ + +parallel_g::parallel_g (parallel_g *parent_, unsigned mask_) + :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0) +{ + forked_block = join_block = 0; + forked_stmt = join_stmt = NULL; + fork_stmt = joining_stmt = NULL; + + record_type = NULL_TREE; + sender_decl = NULL_TREE; + receiver_decl = NULL_TREE; + + if (parent) + { + next = parent->inner; + parent->inner = this; + } +} + +parallel_g::~parallel_g () +{ + delete inner; + delete next; +} + +static bool +local_var_based_p (tree decl) +{ + switch (TREE_CODE (decl)) + { + case VAR_DECL: + return !is_global_var (decl); + + case COMPONENT_REF: + case BIT_FIELD_REF: + case ARRAY_REF: + return local_var_based_p (TREE_OPERAND (decl, 0)); + + default: + return false; + } +} + +/* Map of basic blocks to gimple stmts. */ +typedef hash_map<basic_block, gimple *> bb_stmt_map_t; + +/* Calls to OpenACC routines are made by all workers/wavefronts/warps, since + the routine likely contains partitioned loops (else will do its own + neutering and variable propagation). Return TRUE if a function call CALL + should be made in (worker) single mode instead, rather than redundant + mode. */ + +static bool +omp_sese_active_worker_call (gcall *call) +{ +#define GOMP_DIM_SEQ GOMP_DIM_MAX + tree fndecl = gimple_call_fndecl (call); + + if (!fndecl) + return true; + + tree attrs = oacc_get_fn_attrib (fndecl); + + if (!attrs) + return true; + + int level = oacc_fn_attrib_level (attrs); + + /* Neither regular functions nor "seq" routines should be run by all threads + in worker-single mode. */ + return level == -1 || level == GOMP_DIM_SEQ; +#undef GOMP_DIM_SEQ +} + +/* Split basic blocks such that each forked and join unspecs are at + the start of their basic blocks. 
Thus afterwards each block will + have a single partitioning mode. We also do the same for return + insns, as they are executed by every thread. Return the + partitioning mode of the function as a whole. Populate MAP with + head and tail blocks. We also clear the BB visited flag, which is + used when finding partitions. */ + +static void +omp_sese_split_blocks (bb_stmt_map_t *map) +{ + auto_vec worklist; + basic_block block; + + /* Locate all the reorg instructions of interest. */ + FOR_ALL_BB_FN (block, cfun) + { + /* Clear visited flag, for use by parallel locator */ + block->flags &= ~BB_VISITED; + + for (gimple_stmt_iterator gsi = gsi_start_bb (block); + !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + if (gimple_call_internal_p (stmt, IFN_UNIQUE)) + { + enum ifn_unique_kind k = ((enum ifn_unique_kind) + TREE_INT_CST_LOW (gimple_call_arg (stmt, 0))); + + if (k == IFN_UNIQUE_OACC_JOIN) + worklist.safe_push (stmt); + else if (k == IFN_UNIQUE_OACC_FORK) + { + gcc_assert (gsi_one_before_end_p (gsi)); + basic_block forked_block = single_succ (block); + gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block); + + /* We push a NOP as a placeholder for the "forked" stmt. + This is then recognized in omp_sese_find_par. */ + gimple *nop = gimple_build_nop (); + gsi_insert_before (&gsi2, nop, GSI_SAME_STMT); + + worklist.safe_push (nop); + } + } + else if (gimple_code (stmt) == GIMPLE_RETURN + || gimple_code (stmt) == GIMPLE_COND + || gimple_code (stmt) == GIMPLE_SWITCH + || (gimple_code (stmt) == GIMPLE_CALL + && !gimple_call_internal_p (stmt) + && !omp_sese_active_worker_call (as_a (stmt)))) + worklist.safe_push (stmt); + else if (is_gimple_assign (stmt)) + { + tree lhs = gimple_assign_lhs (stmt); + + /* Force assignments to components/fields/elements of local + aggregates into fully-partitioned (redundant) mode. This + avoids having to broadcast the whole aggregate. 
The RHS of + the assignment will be propagated using the normal + mechanism. */ + + switch (TREE_CODE (lhs)) + { + case COMPONENT_REF: + case BIT_FIELD_REF: + case ARRAY_REF: + { + tree aggr = TREE_OPERAND (lhs, 0); + + if (local_var_based_p (aggr)) + worklist.safe_push (stmt); + } + break; + + default: + ; + } + } + } + } + + /* Split blocks on the worklist. */ + unsigned ix; + gimple *stmt; + + for (ix = 0; worklist.iterate (ix, &stmt); ix++) + { + basic_block block = gimple_bb (stmt); + + if (gimple_code (stmt) == GIMPLE_COND) + { + gcond *orig_cond = as_a (stmt); + tree_code code = gimple_expr_code (orig_cond); + tree pred = make_ssa_name (boolean_type_node); + gimple *asgn = gimple_build_assign (pred, code, + gimple_cond_lhs (orig_cond), + gimple_cond_rhs (orig_cond)); + gcond *new_cond + = gimple_build_cond (NE_EXPR, pred, boolean_false_node, + gimple_cond_true_label (orig_cond), + gimple_cond_false_label (orig_cond)); + + gimple_stmt_iterator gsi = gsi_for_stmt (stmt); + gsi_insert_before (&gsi, asgn, GSI_SAME_STMT); + gsi_replace (&gsi, new_cond, true); + + edge e = split_block (block, asgn); + block = e->dest; + map->get_or_insert (block) = new_cond; + } + else if ((gimple_code (stmt) == GIMPLE_CALL + && !gimple_call_internal_p (stmt)) + || is_gimple_assign (stmt)) + { + gimple_stmt_iterator gsi = gsi_for_stmt (stmt); + gsi_prev (&gsi); + + edge call = split_block (block, gsi_stmt (gsi)); + + gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest)); + + edge call_to_ret = split_block (call->dest, call_stmt); + + map->get_or_insert (call_to_ret->src) = call_stmt; + } + else + { + gimple_stmt_iterator gsi = gsi_for_stmt (stmt); + gsi_prev (&gsi); + + if (gsi_end_p (gsi)) + map->get_or_insert (block) = stmt; + else + { + /* Split block before insn. The insn is in the new block. 
*/ + edge e = split_block (block, gsi_stmt (gsi)); + + block = e->dest; + map->get_or_insert (block) = stmt; + } + } + } +} + +static const char * +mask_name (unsigned mask) +{ + switch (mask) + { + case 0: return "gang redundant"; + case 1: return "gang partitioned"; + case 2: return "worker partitioned"; + case 3: return "gang+worker partitioned"; + case 4: return "vector partitioned"; + case 5: return "gang+vector partitioned"; + case 6: return "worker+vector partitioned"; + case 7: return "fully partitioned"; + default: return ""; + } +} + +/* Dump this parallel and all its inner parallels. */ + +static void +omp_sese_dump_pars (parallel_g *par, unsigned depth) +{ + fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n", + depth, par->mask, mask_name (par->mask), + par->forked_block ? par->forked_block->index : -1, + par->join_block ? par->join_block->index : -1); + + fprintf (dump_file, " blocks:"); + + basic_block block; + for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++) + fprintf (dump_file, " %d", block->index); + fprintf (dump_file, "\n"); + if (par->inner) + omp_sese_dump_pars (par->inner, depth + 1); + + if (par->next) + omp_sese_dump_pars (par->next, depth); +} + +/* If BLOCK contains a fork/join marker, process it to create or + terminate a loop structure. Add this block to the current loop, + and then walk successor blocks. */ + +static parallel_g * +omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block) +{ + if (block->flags & BB_VISITED) + return par; + block->flags |= BB_VISITED; + + if (gimple **stmtp = map->get (block)) + { + gimple *stmt = *stmtp; + + if (gimple_code (stmt) == GIMPLE_COND + || gimple_code (stmt) == GIMPLE_SWITCH + || gimple_code (stmt) == GIMPLE_RETURN + || (gimple_code (stmt) == GIMPLE_CALL + && !gimple_call_internal_p (stmt)) + || is_gimple_assign (stmt)) + { + /* A single block that is forced to be at the maximum partition + level. Make a singleton par for it. 
*/ + par = new parallel_g (par, GOMP_DIM_MASK (GOMP_DIM_GANG) + | GOMP_DIM_MASK (GOMP_DIM_WORKER) + | GOMP_DIM_MASK (GOMP_DIM_VECTOR)); + par->forked_block = block; + par->forked_stmt = stmt; + par->blocks.safe_push (block); + par = par->parent; + goto walk_successors; + } + else if (gimple_nop_p (stmt)) + { + basic_block pred = single_pred (block); + gcc_assert (pred); + gimple_stmt_iterator gsi = gsi_last_bb (pred); + gimple *final_stmt = gsi_stmt (gsi); + + if (gimple_call_internal_p (final_stmt, IFN_UNIQUE)) + { + gcall *call = as_a (final_stmt); + enum ifn_unique_kind k = ((enum ifn_unique_kind) + TREE_INT_CST_LOW (gimple_call_arg (call, 0))); + + if (k == IFN_UNIQUE_OACC_FORK) + { + HOST_WIDE_INT dim + = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); + unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0; + + par = new parallel_g (par, mask); + par->forked_block = block; + par->forked_stmt = final_stmt; + par->fork_stmt = stmt; + } + else + gcc_unreachable (); + } + else + gcc_unreachable (); + } + else if (gimple_call_internal_p (stmt, IFN_UNIQUE)) + { + gcall *call = as_a (stmt); + enum ifn_unique_kind k = ((enum ifn_unique_kind) + TREE_INT_CST_LOW (gimple_call_arg (call, 0))); + if (k == IFN_UNIQUE_OACC_JOIN) + { + HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2)); + unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0; + + gcc_assert (par->mask == mask); + par->join_block = block; + par->join_stmt = stmt; + par = par->parent; + } + else + gcc_unreachable (); + } + else + gcc_unreachable (); + } + + if (par) + /* Add this block onto the current loop's list of blocks. */ + par->blocks.safe_push (block); + else + /* This must be the entry block. Create a NULL parallel. */ + par = new parallel_g (0, 0); + +walk_successors: + /* Walk successor blocks. */ + edge e; + edge_iterator ei; + + FOR_EACH_EDGE (e, ei, block->succs) + omp_sese_find_par (map, par, e->dest); + + return par; +} + +/* DFS walk the CFG looking for fork & join markers. 
Construct + loop structures as we go. MAP is a mapping of basic blocks + to head & tail markers, discovered when splitting blocks. This + speeds up the discovery. We rely on the BB visited flag having + been cleared when splitting blocks. */ + +static parallel_g * +omp_sese_discover_pars (bb_stmt_map_t *map) +{ + basic_block block; + + /* Mark exit blocks as visited. */ + block = EXIT_BLOCK_PTR_FOR_FN (cfun); + block->flags |= BB_VISITED; + + /* And entry block as not. */ + block = ENTRY_BLOCK_PTR_FOR_FN (cfun); + block->flags &= ~BB_VISITED; + + parallel_g *par = omp_sese_find_par (map, 0, block); + + if (dump_file) + { + fprintf (dump_file, "\nLoops\n"); + omp_sese_dump_pars (par, 0); + fprintf (dump_file, "\n"); + } + + return par; +} + +static void +populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single, + bitmap vector_single, unsigned outer_mask, + int depth) +{ + unsigned mask = outer_mask | par->mask; + + basic_block block; + + for (unsigned i = 0; par->blocks.iterate (i, &block); i++) + { + if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0) + bitmap_set_bit (worker_single, block->index); + + if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0) + bitmap_set_bit (vector_single, block->index); + } + + if (par->inner) + populate_single_mode_bitmaps (par->inner, worker_single, vector_single, + mask, depth + 1); + if (par->next) + populate_single_mode_bitmaps (par->next, worker_single, vector_single, + outer_mask, depth); +} + +/* A map from SSA names or var decls to record fields. */ + +typedef hash_map<tree, tree> field_map_t; + +/* For each propagation record type, this is a map from SSA names or var decls + to propagate, to the field in the record type that should be used for + transmission and reception. 
*/ + +typedef hash_map<tree, field_map_t *> record_field_map_t; + +static GTY(()) record_field_map_t *field_map; + +static void +install_var_field (tree var, tree record_type) +{ + field_map_t *fields = *field_map->get (record_type); + tree name; + char tmp[20]; + + if (TREE_CODE (var) == SSA_NAME) + { + name = SSA_NAME_IDENTIFIER (var); + if (!name) + { + sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var)); + name = get_identifier (tmp); + } + } + else if (TREE_CODE (var) == VAR_DECL) + { + name = DECL_NAME (var); + if (!name) + { + sprintf (tmp, "D_%u", (unsigned) DECL_UID (var)); + name = get_identifier (tmp); + } + } + else + gcc_unreachable (); + + gcc_assert (!fields->get (var)); + + tree type = TREE_TYPE (var); + + if (POINTER_TYPE_P (type) + && TYPE_RESTRICT (type)) + type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT); + + tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type); + + if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var)) + { + SET_DECL_ALIGN (field, DECL_ALIGN (var)); + DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var); + TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var); + } + else + SET_DECL_ALIGN (field, TYPE_ALIGN (type)); + + fields->put (var, field); + + insert_field_into_struct (record_type, field); +} + +/* Sets of SSA_NAMES or VAR_DECLs to propagate. 
*/ +typedef hash_set propagation_set; + +static void +find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask, + bitmap worker_single, bitmap vector_single, + vec *prop_set) +{ + unsigned mask = outer_mask | par->mask; + + if (par->inner) + find_ssa_names_to_propagate (par->inner, mask, worker_single, + vector_single, prop_set); + if (par->next) + find_ssa_names_to_propagate (par->next, outer_mask, worker_single, + vector_single, prop_set); + + if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) + { + basic_block block; + int ix; + + for (ix = 0; par->blocks.iterate (ix, &block); ix++) + { + for (gphi_iterator psi = gsi_start_phis (block); + !gsi_end_p (psi); gsi_next (&psi)) + { + gphi *phi = psi.phi (); + use_operand_p use; + ssa_op_iter iter; + + FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE) + { + tree var = USE_FROM_PTR (use); + + if (TREE_CODE (var) != SSA_NAME) + continue; + + gimple *def_stmt = SSA_NAME_DEF_STMT (var); + + if (gimple_nop_p (def_stmt)) + continue; + + basic_block def_bb = gimple_bb (def_stmt); + + if (bitmap_bit_p (worker_single, def_bb->index)) + { + if (!(*prop_set)[def_bb->index]) + (*prop_set)[def_bb->index] = new propagation_set; + + propagation_set *ws_prop = (*prop_set)[def_bb->index]; + + ws_prop->add (var); + } + } + } + + for (gimple_stmt_iterator gsi = gsi_start_bb (block); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + use_operand_p use; + ssa_op_iter iter; + gimple *stmt = gsi_stmt (gsi); + + FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE) + { + tree var = USE_FROM_PTR (use); + + gimple *def_stmt = SSA_NAME_DEF_STMT (var); + + if (gimple_nop_p (def_stmt)) + continue; + + basic_block def_bb = gimple_bb (def_stmt); + + if (bitmap_bit_p (worker_single, def_bb->index)) + { + if (!(*prop_set)[def_bb->index]) + (*prop_set)[def_bb->index] = new propagation_set; + + propagation_set *ws_prop = (*prop_set)[def_bb->index]; + + ws_prop->add (var); + } + } + } + } + } +} + +/* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in 
a + statement. */ + +static tree +find_partitioned_var_uses_1 (tree *node, int *, void *data) +{ + walk_stmt_info *wi = (walk_stmt_info *) data; + hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info; + + if (!wi->is_lhs && VAR_P (*node)) + partitioned_var_uses->add (*node); + + return NULL_TREE; +} + +static void +find_partitioned_var_uses (parallel_g *par, unsigned outer_mask, + hash_set<tree> *partitioned_var_uses) +{ + unsigned mask = outer_mask | par->mask; + + if (par->inner) + find_partitioned_var_uses (par->inner, mask, partitioned_var_uses); + if (par->next) + find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses); + + if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) + { + basic_block block; + int ix; + + for (ix = 0; par->blocks.iterate (ix, &block); ix++) + for (gimple_stmt_iterator gsi = gsi_start_bb (block); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + walk_stmt_info wi; + memset (&wi, 0, sizeof (wi)); + wi.info = (void *) partitioned_var_uses; + walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi); + } + } +} + +/* Gang-private variables (typically placed in a GPU's shared memory) do not + need to be processed by the worker-propagation mechanism. Populate the + GANG_PRIVATE_VARS set with any such variables found in the current + function. 
*/ + +static void +find_gang_private_vars (hash_set *gang_private_vars) +{ + basic_block block; + + FOR_EACH_BB_FN (block, cfun) + { + for (gimple_stmt_iterator gsi = gsi_start_bb (block); + !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + if (gimple_call_internal_p (stmt, IFN_UNIQUE)) + { + enum ifn_unique_kind k = ((enum ifn_unique_kind) + TREE_INT_CST_LOW (gimple_call_arg (stmt, 0))); + if (k == IFN_UNIQUE_OACC_PRIVATE) + { + HOST_WIDE_INT level + = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2)); + if (level != GOMP_DIM_GANG) + continue; + for (unsigned i = 3; i < gimple_call_num_args (stmt); i++) + { + tree arg = gimple_call_arg (stmt, i); + gcc_assert (TREE_CODE (arg) == ADDR_EXPR); + tree decl = TREE_OPERAND (arg, 0); + gang_private_vars->add (decl); + } + } + } + } + } +} + +static void +find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask, + hash_set *partitioned_var_uses, + hash_set *gang_private_vars, + vec *prop_set) +{ + unsigned mask = outer_mask | par->mask; + + if (par->inner) + find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses, + gang_private_vars, prop_set); + if (par->next) + find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses, + gang_private_vars, prop_set); + + if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))) + { + basic_block block; + int ix; + + for (ix = 0; par->blocks.iterate (ix, &block); ix++) + { + for (gimple_stmt_iterator gsi = gsi_start_bb (block); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + tree var; + unsigned i; + + FOR_EACH_LOCAL_DECL (cfun, i, var) + { + if (!VAR_P (var) + || is_global_var (var) + || AGGREGATE_TYPE_P (TREE_TYPE (var)) + || !partitioned_var_uses->contains (var) + || gang_private_vars->contains (var)) + continue; + + if (stmt_may_clobber_ref_p (stmt, var)) + { + if (dump_file) + { + fprintf (dump_file, "bb %u: local variable may be " + "clobbered in %s mode: ", block->index, + mask_name (mask)); + 
print_generic_expr (dump_file, var, TDF_SLIM); + fprintf (dump_file, "\n"); + } + + if (!(*prop_set)[block->index]) + (*prop_set)[block->index] = new propagation_set; + + propagation_set *ws_prop + = (*prop_set)[block->index]; + + ws_prop->add (var); + } + } + } + } + } +} + +/* Transform basic blocks FROM, TO (which may be the same block) into: + if (GOACC_single_start ()) + BLOCK; + GOACC_barrier (); + \ | / + +----+ + | | (new) predicate block + +----+-- + \ | / \ | / |t \ + +----+ +----+ +----+ | + | | | | ===> | | | f (old) from block + +----+ +----+ +----+ | + | t/ \f | / + +----+/ + (split (split before | | skip block + at end) condition) +----+ + t/ \f +*/ + +static void +worker_single_simple (basic_block from, basic_block to, + hash_set<tree> *def_escapes_block) +{ + gimple *call, *cond; + tree lhs, decl; + basic_block skip_block; + + gimple_stmt_iterator gsi = gsi_last_bb (to); + if (EDGE_COUNT (to->succs) > 1) + { + gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND); + gsi_prev (&gsi); + } + edge e = split_block (to, gsi_stmt (gsi)); + skip_block = e->dest; + + gimple_stmt_iterator start = gsi_after_labels (from); + + decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START); + lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl))); + call = gimple_build_call (decl, 0); + gimple_call_set_lhs (call, lhs); + gsi_insert_before (&start, call, GSI_NEW_STMT); + update_stmt (call); + + cond = gimple_build_cond (EQ_EXPR, lhs, + fold_convert_loc (UNKNOWN_LOCATION, + TREE_TYPE (lhs), + boolean_true_node), + NULL_TREE, NULL_TREE); + gsi_insert_after (&start, cond, GSI_NEW_STMT); + update_stmt (cond); + + edge et = split_block (from, cond); + et->flags &= ~EDGE_FALLTHRU; + et->flags |= EDGE_TRUE_VALUE; + /* Make the active worker the more probable path so we prefer fallthrough + (letting the idle workers jump around more).
*/ + et->probability = profile_probability::likely (); + + edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE); + ef->probability = et->probability.invert (); + + basic_block neutered = split_edge (ef); + gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered); + + for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + ssa_op_iter iter; + tree var; + + FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF) + { + if (def_escapes_block->contains (var)) + { + gphi *join_phi = create_phi_node (NULL_TREE, skip_block); + create_new_def_for (var, join_phi, + gimple_phi_result_ptr (join_phi)); + add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION); + + tree neutered_def = copy_ssa_name (var, NULL); + /* We really want "don't care" or some value representing + undefined here, but optimizers will probably get rid of the + zero-assignments anyway. */ + gassign *zero = gimple_build_assign (neutered_def, + build_zero_cst (TREE_TYPE (neutered_def))); + + gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING); + update_stmt (zero); + + add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered), + UNKNOWN_LOCATION); + update_stmt (join_phi); + } + } + } + + gsi = gsi_start_bb (skip_block); + + decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); + gimple *acc_bar = gimple_build_call (decl, 0); + + gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT); + update_stmt (acc_bar); +} + +/* This is a copied and renamed omp-low.c:omp_build_component_ref. 
*/ + +static tree +oacc_build_component_ref (tree obj, tree field) +{ + tree field_type = TREE_TYPE (field); + tree obj_type = TREE_TYPE (obj); + if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (obj_type))) + field_type = build_qualified_type + (field_type, + KEEP_QUAL_ADDR_SPACE (TYPE_QUALS (obj_type))); + + tree ret = build3 (COMPONENT_REF, field_type, obj, field, NULL); + if (TREE_THIS_VOLATILE (field)) + TREE_THIS_VOLATILE (ret) |= 1; + if (TREE_READONLY (field)) + TREE_READONLY (ret) |= 1; + return ret; +} + +static tree +build_receiver_ref (tree record_type, tree var, tree receiver_decl) +{ + field_map_t *fields = *field_map->get (record_type); + tree x = build_simple_mem_ref (receiver_decl); + tree field = *fields->get (var); + TREE_THIS_NOTRAP (x) = 1; + x = oacc_build_component_ref (x, field); + return x; +} + +static tree +build_sender_ref (tree record_type, tree var, tree sender_decl) +{ + field_map_t *fields = *field_map->get (record_type); + tree field = *fields->get (var); + return oacc_build_component_ref (sender_decl, field); +} + +static int +sort_by_ssa_version_or_uid (const void *p1, const void *p2) +{ + const tree t1 = *(const tree *)p1; + const tree t2 = *(const tree *)p2; + + if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME) + return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2); + else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME) + return -1; + else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME) + return 1; + else + return DECL_UID (t1) - DECL_UID (t2); +} + +static int +sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2) +{ + const tree t1 = *(const tree *)p1; + const tree t2 = *(const tree *)p2; + unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1))); + unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2))); + if (s1 != s2) + return s2 - s1; + else + return sort_by_ssa_version_or_uid (p1, p2); +} + +static void +worker_single_copy (basic_block from, 
basic_block to, + hash_set<tree> *def_escapes_block, + hash_set<tree> *worker_partitioned_uses, + tree record_type) +{ + /* If we only have virtual defs, we'll have no record type, but we still want + to emit single_copy_start and (particularly) single_copy_end to act as + a vdef source on the neutered edge representing memory writes on the + non-neutered edge. */ + if (!record_type) + record_type = char_type_node; + + tree sender_decl + = targetm.goacc.create_worker_broadcast_record (record_type, true, + ".oacc_worker_o"); + tree receiver_decl + = targetm.goacc.create_worker_broadcast_record (record_type, false, + ".oacc_worker_i"); + + gimple_stmt_iterator gsi = gsi_last_bb (to); + if (EDGE_COUNT (to->succs) > 1) + gsi_prev (&gsi); + edge e = split_block (to, gsi_stmt (gsi)); + basic_block barrier_block = e->dest; + + gimple_stmt_iterator start = gsi_after_labels (from); + + tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START); + + tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl))); + + gimple *call = gimple_build_call (decl, 1, + build_fold_addr_expr (sender_decl)); + gimple_call_set_lhs (call, lhs); + gsi_insert_before (&start, call, GSI_NEW_STMT); + update_stmt (call); + + tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl)); + + gimple *conv = gimple_build_assign (conv_tmp, + fold_convert (TREE_TYPE (receiver_decl), + lhs)); + update_stmt (conv); + gsi_insert_after (&start, conv, GSI_NEW_STMT); + gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp); + gsi_insert_after (&start, asgn, GSI_NEW_STMT); + update_stmt (asgn); + + tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0); + + tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl)); + asgn = gimple_build_assign (recv_tmp, receiver_decl); + gsi_insert_after (&start, asgn, GSI_NEW_STMT); + update_stmt (asgn); + + gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE, + NULL_TREE); + update_stmt (cond); + + gsi_insert_after (&start, cond, GSI_NEW_STMT); + +
edge et = split_block (from, cond); + et->flags &= ~EDGE_FALLTHRU; + et->flags |= EDGE_TRUE_VALUE; + /* Make the active worker the more probable path so we prefer fallthrough + (letting the idle workers jump around more). */ + et->probability = profile_probability::likely (); + + basic_block body = et->dest; + + edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE); + ef->probability = et->probability.invert (); + + decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); + gimple *acc_bar = gimple_build_call (decl, 0); + + gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block); + gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT); + + cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE); + gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT); + + edge et2 = split_block (barrier_block, cond); + et2->flags &= ~EDGE_FALLTHRU; + et2->flags |= EDGE_TRUE_VALUE; + et2->probability = profile_probability::unlikely (); + + basic_block exit_block = et2->dest; + + basic_block copyout_block = split_edge (et2); + edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE); + ef2->probability = et2->probability.invert (); + + gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block); + + edge copyout_to_exit = single_succ_edge (copyout_block); + + gimple_seq sender_seq = NULL; + + /* Make sure we iterate over definitions in a stable order. 
*/ + auto_vec<tree> escape_vec (def_escapes_block->elements ()); + for (hash_set<tree>::iterator it = def_escapes_block->begin (); + it != def_escapes_block->end (); ++it) + escape_vec.quick_push (*it); + escape_vec.qsort (sort_by_ssa_version_or_uid); + + for (unsigned i = 0; i < escape_vec.length (); i++) + { + tree var = escape_vec[i]; + + if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var)) + continue; + + tree barrier_def = 0; + + if (TREE_CODE (var) == SSA_NAME) + { + gimple *def_stmt = SSA_NAME_DEF_STMT (var); + + if (gimple_nop_p (def_stmt)) + continue; + + /* The barrier phi takes one result from the actual work of the + block we're neutering, and the other result is constant zero of + the same type. */ + + gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block); + barrier_def = create_new_def_for (var, barrier_phi, + gimple_phi_result_ptr (barrier_phi)); + + add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION); + add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef, + UNKNOWN_LOCATION); + + update_stmt (barrier_phi); + } + else + gcc_assert (TREE_CODE (var) == VAR_DECL); + + /* If we had no record type, we will have no fields map. */ + field_map_t **fields_p = field_map->get (record_type); + field_map_t *fields = fields_p ? *fields_p : NULL; + + if (worker_partitioned_uses->contains (var) + && fields + && fields->get (var)) + { + tree neutered_def = make_ssa_name (TREE_TYPE (var)); + + /* Receive definition from shared memory block. */ + + tree receiver_ref = build_receiver_ref (record_type, var, + receiver_decl); + gassign *recv = gimple_build_assign (neutered_def, + receiver_ref); + gsi_insert_after (&copyout_gsi, recv, GSI_CONTINUE_LINKING); + update_stmt (recv); + + if (TREE_CODE (var) == VAR_DECL) + { + /* If it's a VAR_DECL, we only copied to an SSA temporary. Copy + to the final location now.
*/ + gassign *asgn = gimple_build_assign (var, neutered_def); + gsi_insert_after (&copyout_gsi, asgn, GSI_CONTINUE_LINKING); + update_stmt (asgn); + } + else + { + /* If it's an SSA name, create a new phi at the join node to + represent either the output from the active worker (the + barrier) or the inactive workers (the copyout block). */ + gphi *join_phi = create_phi_node (NULL_TREE, exit_block); + create_new_def_for (barrier_def, join_phi, + gimple_phi_result_ptr (join_phi)); + add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION); + add_phi_arg (join_phi, neutered_def, copyout_to_exit, + UNKNOWN_LOCATION); + update_stmt (join_phi); + } + + /* Send definition to shared memory block. */ + + tree sender_ref = build_sender_ref (record_type, var, sender_decl); + + if (TREE_CODE (var) == SSA_NAME) + { + gassign *send = gimple_build_assign (sender_ref, var); + gimple_seq_add_stmt (&sender_seq, send); + update_stmt (send); + } + else if (TREE_CODE (var) == VAR_DECL) + { + tree tmp = make_ssa_name (TREE_TYPE (var)); + gassign *send = gimple_build_assign (tmp, var); + gimple_seq_add_stmt (&sender_seq, send); + update_stmt (send); + send = gimple_build_assign (sender_ref, tmp); + gimple_seq_add_stmt (&sender_seq, send); + update_stmt (send); + } + else + gcc_unreachable (); + } + } + + /* It's possible for the ET->DEST block (the work done by the active thread) + to finish with a control-flow insn, e.g. a UNIQUE function call. Split + the block and add SENDER_SEQ in the latter part to avoid having control + flow in the middle of a BB.
*/ + + decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END); + call = gimple_build_call (decl, 1, build_fold_addr_expr (sender_decl)); + gimple_seq_add_stmt (&sender_seq, call); + + gsi = gsi_last_bb (body); + gimple *last = gsi_stmt (gsi); + basic_block sender_block = split_block (body, last)->dest; + gsi = gsi_last_bb (sender_block); + gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING); +} + +static void +neuter_worker_single (parallel_g *par, unsigned outer_mask, + bitmap worker_single, bitmap vector_single, + vec<propagation_set *> *prop_set, + hash_set<tree> *partitioned_var_uses) +{ + unsigned mask = outer_mask | par->mask; + + if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0) + { + basic_block block; + + for (unsigned i = 0; par->blocks.iterate (i, &block); i++) + { + bool has_defs = false; + hash_set<tree> def_escapes_block; + hash_set<tree> worker_partitioned_uses; + unsigned j; + tree var; + + FOR_EACH_SSA_NAME (j, var, cfun) + { + if (SSA_NAME_IS_VIRTUAL_OPERAND (var)) + { + has_defs = true; + continue; + } + + gimple *def_stmt = SSA_NAME_DEF_STMT (var); + + if (gimple_nop_p (def_stmt)) + continue; + + if (gimple_bb (def_stmt)->index != block->index) + continue; + + gimple *use_stmt; + imm_use_iterator use_iter; + bool uses_outside_block = false; + bool worker_partitioned_use = false; + + FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var) + { + int blocknum = gimple_bb (use_stmt)->index; + + /* Don't propagate SSA names that are only used in the + current block, unless the usage is in a phi node: that + means the name left the block, then came back in at the + top.
*/ + if (blocknum != block->index + || gimple_code (use_stmt) == GIMPLE_PHI) + uses_outside_block = true; + if (!bitmap_bit_p (worker_single, blocknum)) + worker_partitioned_use = true; + } + + if (uses_outside_block) + def_escapes_block.add (var); + + if (worker_partitioned_use) + { + worker_partitioned_uses.add (var); + has_defs = true; + } + } + + propagation_set *ws_prop = (*prop_set)[block->index]; + + if (ws_prop) + { + for (propagation_set::iterator it = ws_prop->begin (); + it != ws_prop->end (); + ++it) + { + tree var = *it; + if (TREE_CODE (var) == VAR_DECL) + { + def_escapes_block.add (var); + if (partitioned_var_uses->contains (var)) + { + worker_partitioned_uses.add (var); + has_defs = true; + } + } + } + + delete ws_prop; + (*prop_set)[block->index] = 0; + } + + tree record_type = (tree) block->aux; + + if (has_defs) + worker_single_copy (block, block, &def_escapes_block, + &worker_partitioned_uses, record_type); + else + worker_single_simple (block, block, &def_escapes_block); + } + } + + if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0) + { + basic_block block; + + for (unsigned i = 0; par->blocks.iterate (i, &block); i++) + for (gimple_stmt_iterator gsi = gsi_start_bb (block); + !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + if (gimple_code (stmt) == GIMPLE_CALL + && !gimple_call_internal_p (stmt) + && !omp_sese_active_worker_call (as_a <gcall *> (stmt))) + { + /* If we have an OpenACC routine call in worker-single mode, + place barriers before and afterwards to prevent + clobbering re-used shared memory regions (as are used + for AMDGCN at present, for example).
*/ + tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); + gsi_insert_before (&gsi, gimple_build_call (decl, 0), + GSI_SAME_STMT); + gsi_insert_after (&gsi, gimple_build_call (decl, 0), + GSI_NEW_STMT); + } + } + } + + if (par->inner) + neuter_worker_single (par->inner, mask, worker_single, vector_single, + prop_set, partitioned_var_uses); + if (par->next) + neuter_worker_single (par->next, outer_mask, worker_single, vector_single, + prop_set, partitioned_var_uses); +} + +static int +execute_omp_oacc_neuter_broadcast () +{ + bb_stmt_map_t bb_stmt_map; + auto_bitmap worker_single, vector_single; + + omp_sese_split_blocks (&bb_stmt_map); + + if (dump_file) + { + fprintf (dump_file, "\n\nAfter splitting:\n\n"); + dump_function_to_file (current_function_decl, dump_file, dump_flags); + } + + unsigned mask = 0; + + /* If this is a routine, calculate MASK as if the outer levels are already + partitioned. */ + tree attr = oacc_get_fn_attrib (current_function_decl); + if (attr) + { + tree dims = TREE_VALUE (attr); + unsigned ix; + for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) + { + tree allowed = TREE_PURPOSE (dims); + if (allowed && integer_zerop (allowed)) + mask |= GOMP_DIM_MASK (ix); + } + } + + parallel_g *par = omp_sese_discover_pars (&bb_stmt_map); + populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0); + + basic_block bb; + FOR_ALL_BB_FN (bb, cfun) + bb->aux = NULL; + + field_map = record_field_map_t::create_ggc (40); + + vec<propagation_set *> prop_set; + prop_set.create (last_basic_block_for_fn (cfun)); + + for (int i = 0; i < last_basic_block_for_fn (cfun); i++) + prop_set.quick_push (0); + + find_ssa_names_to_propagate (par, mask, worker_single, vector_single, + &prop_set); + + hash_set<tree> partitioned_var_uses; + hash_set<tree> gang_private_vars; + + find_gang_private_vars (&gang_private_vars); + find_partitioned_var_uses (par, mask, &partitioned_var_uses); + find_local_vars_to_propagate (par, mask, &partitioned_var_uses, +
&gang_private_vars, &prop_set); + + FOR_ALL_BB_FN (bb, cfun) + { + propagation_set *ws_prop = prop_set[bb->index]; + if (ws_prop) + { + tree record_type = lang_hooks.types.make_type (RECORD_TYPE); + tree name = create_tmp_var_name (".oacc_ws_data_s"); + name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type); + DECL_ARTIFICIAL (name) = 1; + DECL_NAMELESS (name) = 1; + TYPE_NAME (record_type) = name; + TYPE_ARTIFICIAL (record_type) = 1; + + auto_vec<tree> field_vec (ws_prop->elements ()); + for (hash_set<tree>::iterator it = ws_prop->begin (); + it != ws_prop->end (); ++it) + field_vec.quick_push (*it); + + field_vec.qsort (sort_by_size_then_ssa_version_or_uid); + + field_map->put (record_type, field_map_t::create_ggc (17)); + + /* Insert var fields in reverse order, so the last inserted element + is the first in the structure. */ + for (int i = field_vec.length () - 1; i >= 0; i--) + install_var_field (field_vec[i], record_type); + + layout_type (record_type); + + bb->aux = (tree) record_type; + } + } + + neuter_worker_single (par, mask, worker_single, vector_single, &prop_set, + &partitioned_var_uses); + + prop_set.release (); + + /* This doesn't seem to make a difference. */ + loops_state_clear (LOOP_CLOSED_SSA); + + /* Neutering worker-single neutered blocks will invalidate dominance info. + It may be possible to incrementally update just the affected blocks, but + obliterate everything for now.
*/ + free_dominance_info (CDI_DOMINATORS); + free_dominance_info (CDI_POST_DOMINATORS); + + if (dump_file) + { + fprintf (dump_file, "\n\nAfter neutering:\n\n"); + dump_function_to_file (current_function_decl, dump_file, dump_flags); + } + + return 0; +} + +namespace { + +const pass_data pass_data_omp_oacc_neuter_broadcast = +{ + GIMPLE_PASS, /* type */ + "omp_oacc_neuter_broadcast", /* name */ + OPTGROUP_OMP, /* optinfo_flags */ + TV_NONE, /* tv_id */ + PROP_cfg, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */ +}; + +class pass_omp_oacc_neuter_broadcast : public gimple_opt_pass +{ +public: + pass_omp_oacc_neuter_broadcast (gcc::context *ctxt) + : gimple_opt_pass (pass_data_omp_oacc_neuter_broadcast, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return (flag_openacc + && targetm.goacc.create_worker_broadcast_record); + }; + + virtual unsigned int execute (function *) + { + return execute_omp_oacc_neuter_broadcast (); + } + +}; // class pass_omp_oacc_neuter_broadcast + +} // anon namespace + +gimple_opt_pass * +make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt) +{ + return new pass_omp_oacc_neuter_broadcast (ctxt); +} diff --git a/gcc/passes.def b/gcc/passes.def index 26d86df..d7a1f8c 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -184,6 +184,7 @@ along with GCC; see the file COPYING3. 
If not see NEXT_PASS (pass_fixup_cfg); NEXT_PASS (pass_lower_eh_dispatch); NEXT_PASS (pass_oacc_loop_designation); + NEXT_PASS (pass_omp_oacc_neuter_broadcast); NEXT_PASS (pass_oacc_device_lower); NEXT_PASS (pass_omp_device_lower); NEXT_PASS (pass_omp_target_link); diff --git a/gcc/target.def b/gcc/target.def index 68a46aa..7676d5e 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1756,6 +1756,17 @@ private variables at OpenACC device-lowering time using the\n\ rtx, (tree var), NULL) +DEFHOOK +(create_worker_broadcast_record, +"Create a record used to propagate local-variable state from an active\n\ +worker to other workers. A possible implementation might adjust the type\n\ +of REC to place the new variable in shared GPU memory.\n\ +\n\ +Presence of this target hook indicates that middle end neutering/broadcasting\n\ +be used.", +tree, (tree rec, bool sender, const char *name), +NULL) + HOOK_VECTOR_END (goacc) /* Functions relating to vectorization. */ diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 5484ad5..83941bc 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -425,6 +425,7 @@ extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt); extern gimple_opt_pass *make_pass_expand_omp_ssa (gcc::context *ctxt); extern gimple_opt_pass *make_pass_omp_target_link (gcc::context *ctxt); extern gimple_opt_pass *make_pass_oacc_loop_designation (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt); extern gimple_opt_pass *make_pass_oacc_device_lower (gcc::context *ctxt); extern gimple_opt_pass *make_pass_omp_device_lower (gcc::context *ctxt); extern gimple_opt_pass *make_pass_object_sizes (gcc::context *ctxt); -- cgit v1.1 From c408512e1f7ca07e07794dc13fd6dfd9d2d7e998 Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Tue, 2 Mar 2021 04:20:13 -0800 Subject: amdgcn: Enable OpenACC worker partitioning for AMD GCN gcc/ * config/gcn/gcn.c (gcn_init_builtins): Override decls for BUILT_IN_GOACC_SINGLE_START, 
BUILT_IN_GOACC_SINGLE_COPY_START, BUILT_IN_GOACC_SINGLE_COPY_END and BUILT_IN_GOACC_BARRIER. (gcn_goacc_validate_dims): Turn on worker partitioning unconditionally. (gcn_fork_join): Update comment. * config/gcn/gcn.opt (flag_worker_partitioning): Remove. (macc_experimental_workers): Remove unused option. libgomp/ * plugin/plugin-gcn.c (gcn_exec): Change default number of workers to 16. * testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c [acc_device_radeon]: Update. * testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c [ACC_DEVICE_TYPE_radeon]: Likewise. * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c [acc_device_radeon]: Likewise. * testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c [ACC_DEVICE_TYPE_radeon]: Likewise. * testsuite/libgomp.oacc-fortran/optional-reduction.f90: XFAIL for 'openacc_radeon_accel_selected' and '-O0'. * testsuite/libgomp.oacc-fortran/reduction-7.f90: Likewise. Co-Authored-By: Kwok Cheung Yeung Co-Authored-By: Thomas Schwinge --- gcc/config/gcn/gcn.c | 15 +++------------ gcc/config/gcn/gcn.opt | 5 ----- 2 files changed, 3 insertions(+), 17 deletions(-) (limited to 'gcc') diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 87af5d1..9df2827 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -3712,8 +3712,6 @@ gcn_init_builtins (void) TREE_NOTHROW (gcn_builtin_decls[i]) = 1; } -/* FIXME: remove the ifdef once OpenACC support is merged upstream. */ -#ifdef BUILT_IN_GOACC_SINGLE_START /* These builtins need to take/return an LDS pointer: override the generic versions here. */ @@ -3730,7 +3728,6 @@ gcn_init_builtins (void) set_builtin_decl (BUILT_IN_GOACC_BARRIER, gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false); -#endif } /* Implement TARGET_INIT_LIBFUNCS. */ @@ -5019,11 +5016,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned /*used*/) { bool changed = false; - - /* FIXME: remove -facc-experimental-workers when they're ready. */ - int max_workers = flag_worker_partitioning ? 
16 : 1; - - gcc_assert (!flag_worker_partitioning); + const int max_workers = 16; /* The vector size must appear to be 64, to the user, unless this is a SEQ routine. The real, internal value is always 1, which means use @@ -5060,8 +5053,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level, { dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS; if (dims[GOMP_DIM_WORKER] < 0) - dims[GOMP_DIM_WORKER] = (flag_worker_partitioning - ? GCN_DEFAULT_WORKERS : 1); + dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS; if (dims[GOMP_DIM_GANG] < 0) dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS; changed = true; @@ -5126,8 +5118,7 @@ static bool gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims), bool ARG_UNUSED (is_fork)) { - /* GCN does not use the fork/join concept invented for NVPTX. - Instead we use standard autovectorization. */ + /* GCN does not need to expand fork/join markers at the RTL level. */ return false; } diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt index b2b10b0..6faacca 100644 --- a/gcc/config/gcn/gcn.opt +++ b/gcc/config/gcn/gcn.opt @@ -62,11 +62,6 @@ bool flag_bypass_init_error = false mbypass-init-error Target RejectNegative Var(flag_bypass_init_error) -bool flag_worker_partitioning = false - -macc-experimental-workers -Target Var(flag_worker_partitioning) Init(0) - int stack_size_opt = -1 mstack-size= -- cgit v1.1 From 62f01243fb27030b8d99c671f27349c2e7465edc Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Mon, 9 Aug 2021 12:21:43 +0200 Subject: Cross-reference parts adapted in 'gcc/omp-oacc-neuter-broadcast.cc' gcc/ * config/nvptx/nvptx.c: Cross-reference parts adapted in 'gcc/omp-oacc-neuter-broadcast.cc'. * omp-low.c: Likewise. * omp-oacc-neuter-broadcast.cc: Cross-reference parts adapted from the above files. 
--- gcc/config/nvptx/nvptx.c | 5 +++++ gcc/omp-low.c | 2 ++ gcc/omp-oacc-neuter-broadcast.cc | 9 ++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 6642bdf..4e4909e 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -3205,6 +3205,7 @@ nvptx_mach_vector_length () /* Loop structure of the function. The entire function is described as a NULL loop. */ +/* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'. */ struct parallel { @@ -3282,6 +3283,7 @@ typedef auto_vec insn_bb_vec_t; partitioning mode of the function as a whole. Populate MAP with head and tail blocks. We also clear the BB visited flag, which is used when finding partitions. */ +/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_split_blocks'. */ static void nvptx_split_blocks (bb_insn_map_t *map) @@ -3383,6 +3385,7 @@ nvptx_discover_pre (basic_block block, int expected) } /* Dump this parallel and all its inner parallels. */ +/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_dump_pars'. */ static void nvptx_dump_pars (parallel *par, unsigned depth) @@ -3408,6 +3411,7 @@ nvptx_dump_pars (parallel *par, unsigned depth) /* If BLOCK contains a fork/join marker, process it to create or terminate a loop structure. Add this block to the current loop, and then walk successor blocks. */ +/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_find_par'. */ static parallel * nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block) @@ -3488,6 +3492,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block) to head & tail markers, discovered when splitting blocks. This speeds up the discovery. We rely on the BB visited flag having been cleared when splitting blocks. */ +/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_discover_pars'. 
*/ static parallel * nvptx_discover_pars (bb_insn_map_t *map) diff --git a/gcc/omp-low.c b/gcc/omp-low.c index 2f735bc..926087d 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -615,6 +615,8 @@ omp_copy_decl_1 (tree var, omp_context *ctx) /* Build COMPONENT_REF and set TREE_THIS_VOLATILE and TREE_READONLY on it as appropriate. */ +/* See also 'gcc/omp-oacc-neuter-broadcast.cc:oacc_build_component_ref'. */ + static tree omp_build_component_ref (tree obj, tree field) { diff --git a/gcc/omp-oacc-neuter-broadcast.cc b/gcc/omp-oacc-neuter-broadcast.cc index 0f6ba88..f855538 100644 --- a/gcc/omp-oacc-neuter-broadcast.cc +++ b/gcc/omp-oacc-neuter-broadcast.cc @@ -56,6 +56,7 @@ /* Loop structure of the function. The entire function is described as a NULL loop. */ +/* Adapted from 'gcc/config/nvptx/nvptx.c:struct parallel'. */ struct parallel_g { @@ -183,6 +184,7 @@ omp_sese_active_worker_call (gcall *call) partitioning mode of the function as a whole. Populate MAP with head and tail blocks. We also clear the BB visited flag, which is used when finding partitions. */ +/* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_split_blocks'. */ static void omp_sese_split_blocks (bb_stmt_map_t *map) @@ -341,6 +343,7 @@ mask_name (unsigned mask) } /* Dump this parallel and all its inner parallels. */ +/* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_dump_pars'. */ static void omp_sese_dump_pars (parallel_g *par, unsigned depth) @@ -366,6 +369,7 @@ omp_sese_dump_pars (parallel_g *par, unsigned depth) /* If BLOCK contains a fork/join marker, process it to create or terminate a loop structure. Add this block to the current loop, and then walk successor blocks. */ +/* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_find_par'. */ static parallel_g * omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block) @@ -471,6 +475,7 @@ walk_successors: to head & tail markers, discovered when splitting blocks. This speeds up the discovery. 
We rely on the BB visited flag having been cleared when splitting blocks. */ +/* Adapted from 'gcc/config/nvptx/nvptx.c:nvptx_discover_pars'. */ static parallel_g * omp_sese_discover_pars (bb_stmt_map_t *map) @@ -931,7 +936,9 @@ worker_single_simple (basic_block from, basic_block to, update_stmt (acc_bar); } -/* This is a copied and renamed omp-low.c:omp_build_component_ref. */ +/* Build COMPONENT_REF and set TREE_THIS_VOLATILE and TREE_READONLY on it + as appropriate. */ +/* Adapted from 'gcc/omp-low.c:omp_build_component_ref'. */ static tree oacc_build_component_ref (tree obj, tree field) -- cgit v1.1 From 9d2d660aab2f332b1e3f69a2fb3419cf3cc33b47 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 9 Aug 2021 16:38:54 +0200 Subject: i386: Name V2SF logic insns [PR101812] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Name V2SF logic insns, so expand_simple_binop works with V2SF modes. 2021-08-09 Uroš Bizjak gcc/ PR target/101812 * config/i386/mmx.md (v2sf3): Rename from *mmx_v2sf3 gcc/testsuite/ PR target/101812 * gcc.target/i386/pr101812.c: New test. 
--- gcc/config/i386/mmx.md | 2 +- gcc/testsuite/gcc.target/i386/pr101812.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr101812.c (limited to 'gcc') diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 0984f7c..2d3b63f 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -952,7 +952,7 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "V4SF")]) -(define_insn "*mmx_v2sf3" +(define_insn "v2sf3" [(set (match_operand:V2SF 0 "register_operand" "=x,x") (any_logic:V2SF (match_operand:V2SF 1 "register_operand" "%0,x") diff --git a/gcc/testsuite/gcc.target/i386/pr101812.c b/gcc/testsuite/gcc.target/i386/pr101812.c new file mode 100644 index 0000000..07e84a3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101812.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O1 -ftree-loop-vectorize -fvect-cost-model=unlimited" } */ + +#define LTGT(a, b) (__builtin_islessgreater (a, b) ? a : b) +void foo (int ilast,float* w, float* w2) +{ + int i; + for (i = 0; i < ilast; ++i) + { + w[i] = LTGT (0.0f, w2[i]); + } +} -- cgit v1.1 From 00eab082e9f6ac2a7c4b38323829be29f092abcb Mon Sep 17 00:00:00 2001 From: Pat Haugen Date: Mon, 9 Aug 2021 10:05:49 -0500 Subject: Verify destination[source] of a load[store] instruction is a register. gcc/ChangeLog: * config/rs6000/rs6000.c (is_load_insn1): Verify destination is a register. (is_store_insn1): Verify source is a register. 
--- gcc/config/rs6000/rs6000.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'gcc') diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 5b1c06b..60f406a 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -18363,7 +18363,12 @@ is_load_insn1 (rtx pat, rtx *load_mem) return false; if (GET_CODE (pat) == SET) - return find_mem_ref (SET_SRC (pat), load_mem); + { + if (REG_P (SET_DEST (pat))) + return find_mem_ref (SET_SRC (pat), load_mem); + else + return false; + } if (GET_CODE (pat) == PARALLEL) { @@ -18400,7 +18405,12 @@ is_store_insn1 (rtx pat, rtx *str_mem) return false; if (GET_CODE (pat) == SET) - return find_mem_ref (SET_DEST (pat), str_mem); + { + if (REG_P (SET_SRC (pat)) || SUBREG_P (SET_SRC (pat))) + return find_mem_ref (SET_DEST (pat), str_mem); + else + return false; + } if (GET_CODE (pat) == PARALLEL) { -- cgit v1.1 From d55d3f5b04e81b79f34ccf23f7e2c1e736ad3b0d Mon Sep 17 00:00:00 2001 From: Martin Jambor Date: Mon, 9 Aug 2021 17:35:39 +0200 Subject: ipa: Fix testsuite/gcc.dg/ipa/remref-6.c I forgot to add -fdump-ipa-inline to options of testsuite/gcc.dg/ipa/remref-6.c and so the dump scan test were not PASSing but ended up as UNRESOLVED. Fixing that revealed that the one of the dumps it was looking for had a double space, so I removed it too. gcc/ChangeLog: 2021-08-09 Martin Jambor PR testsuite/101654 * ipa-prop.c (propagate_controlled_uses): Removed a spurious space. gcc/testsuite/ChangeLog: 2021-08-09 Martin Jambor PR testsuite/101654 * gcc.dg/ipa/remref-6.c: Added missing -fdump-ipa-inline option. 
--- gcc/ipa-prop.c | 2 +- gcc/testsuite/gcc.dg/ipa/remref-6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc') diff --git a/gcc/ipa-prop.c b/gcc/ipa-prop.c index 0afd05e..1c69d97 100644 --- a/gcc/ipa-prop.c +++ b/gcc/ipa-prop.c @@ -4204,7 +4204,7 @@ propagate_controlled_uses (struct cgraph_edge *cs) new_root->create_reference (n, IPA_REF_LOAD, NULL); if (dump_file) fprintf (dump_file, "ipa-prop: ...replaced it with " - " LOAD one from %s to %s.\n", + "LOAD one from %s to %s.\n", new_root->dump_name (), n->dump_name ()); } diff --git a/gcc/testsuite/gcc.dg/ipa/remref-6.c b/gcc/testsuite/gcc.dg/ipa/remref-6.c index de36493..7deae31 100644 --- a/gcc/testsuite/gcc.dg/ipa/remref-6.c +++ b/gcc/testsuite/gcc.dg/ipa/remref-6.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fno-early-inlining -fno-ipa-cp -fdump-tree-optimized" } */ +/* { dg-options "-O2 -fno-early-inlining -fno-ipa-cp -fdump-ipa-inline -fdump-tree-optimized" } */ static double global = 0.0; -- cgit v1.1 From c86c95edd165d674614516cda0b1fcb6616c1096 Mon Sep 17 00:00:00 2001 From: Andrew MacLeod Date: Mon, 9 Aug 2021 15:53:42 -0400 Subject: Ensure toupper and tolower follow the expected pattern. If the parameter is not compatible with the LHS, assume this is not really a builtin function to avoid a trap. gcc/ PR tree-optimization/101741 * gimple-range-fold.cc (fold_using_range::range_of_builtin_call): Check type of parameter for toupper/tolower. gcc/testsuite/ * gcc.dg/pr101741.c: New. 
--- gcc/gimple-range-fold.cc | 6 ++++++ gcc/testsuite/gcc.dg/pr101741.c | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/pr101741.c (limited to 'gcc') diff --git a/gcc/gimple-range-fold.cc b/gcc/gimple-range-fold.cc index 410bc4d..d3e3e14 100644 --- a/gcc/gimple-range-fold.cc +++ b/gcc/gimple-range-fold.cc @@ -894,6 +894,9 @@ fold_using_range::range_of_builtin_call (irange &r, gcall *call, case CFN_BUILT_IN_TOUPPER: { arg = gimple_call_arg (call, 0); + // If the argument isn't compatible with the LHS, do nothing. + if (!range_compatible_p (type, TREE_TYPE (arg))) + return false; if (!src.get_operand (r, arg)) return false; @@ -913,6 +916,9 @@ fold_using_range::range_of_builtin_call (irange &r, gcall *call, case CFN_BUILT_IN_TOLOWER: { arg = gimple_call_arg (call, 0); + // If the argument isn't compatible with the LHS, do nothing. + if (!range_compatible_p (type, TREE_TYPE (arg))) + return false; if (!src.get_operand (r, arg)) return false; diff --git a/gcc/testsuite/gcc.dg/pr101741.c b/gcc/testsuite/gcc.dg/pr101741.c new file mode 100644 index 0000000..6587dca --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr101741.c @@ -0,0 +1,16 @@ +/* PR tree-optimization/101741 */ +/* { dg-do compile } */ +/* { dg-options "-O2 " } */ + +int +foo (void); + +unsigned int +toupper (int c) +{ + c = foo (); + while (c) + c = toupper (c); + + return c; +} -- cgit v1.1 From 377681505fb192876e277697e29d201e528d484a Mon Sep 17 00:00:00 2001 From: GCC Administrator Date: Tue, 10 Aug 2021 00:16:28 +0000 Subject: Daily bump. 
--- gcc/ChangeLog | 123 ++++++++++++++++++++++++++++++++++++++++++++++++ gcc/DATESTAMP | 2 +- gcc/testsuite/ChangeLog | 67 ++++++++++++++++++++++++++ 3 files changed, 191 insertions(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 9d39f0f..e2ffd84 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,126 @@ +2021-08-09 Andrew MacLeod + + PR tree-optimization/101741 + * gimple-range-fold.cc (fold_using_range::range_of_builtin_call): Check + type of parameter for toupper/tolower. + +2021-08-09 Martin Jambor + + PR testsuite/101654 + * ipa-prop.c (propagate_controlled_uses): Removed a spurious space. + +2021-08-09 Pat Haugen + + * config/rs6000/rs6000.c (is_load_insn1): Verify destination is a + register. + (is_store_insn1): Verify source is a register. + +2021-08-09 Uroš Bizjak + + PR target/101812 + * config/i386/mmx.md (v2sf3): + Rename from *mmx_v2sf3 + +2021-08-09 Thomas Schwinge + + * config/nvptx/nvptx.c: Cross-reference parts adapted in + 'gcc/omp-oacc-neuter-broadcast.cc'. + * omp-low.c: Likewise. + * omp-oacc-neuter-broadcast.cc: Cross-reference parts adapted from + the above files. + +2021-08-09 Julian Brown + Kwok Cheung Yeung + Thomas Schwinge + + * config/gcn/gcn.c (gcn_init_builtins): Override decls for + BUILT_IN_GOACC_SINGLE_START, BUILT_IN_GOACC_SINGLE_COPY_START, + BUILT_IN_GOACC_SINGLE_COPY_END and BUILT_IN_GOACC_BARRIER. + (gcn_goacc_validate_dims): Turn on worker partitioning unconditionally. + (gcn_fork_join): Update comment. + * config/gcn/gcn.opt (flag_worker_partitioning): Remove. + (macc_experimental_workers): Remove unused option. + +2021-08-09 Julian Brown + Nathan Sidwell (via 'gcc/config/nvptx/nvptx.c' master) + Kwok Cheung Yeung + Thomas Schwinge + + * Makefile.in (OBJS): Add omp-oacc-neuter-broadcast.o. + * doc/tm.texi.in (TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD): + Add documentation hook. + * doc/tm.texi: Regenerate. + * omp-oacc-neuter-broadcast.cc: New file. 
+ * omp-builtins.def (BUILT_IN_GOACC_BARRIER) + (BUILT_IN_GOACC_SINGLE_START, BUILT_IN_GOACC_SINGLE_COPY_START) + (BUILT_IN_GOACC_SINGLE_COPY_END): New builtins. + * passes.def (pass_omp_oacc_neuter_broadcast): Add pass. + * target.def (goacc.create_worker_broadcast_record): Add target + hook. + * tree-pass.h (make_pass_omp_oacc_neuter_broadcast): Add + prototype. + * config/gcn/gcn-protos.h (gcn_goacc_adjust_propagation_record): + Rename prototype to... + (gcn_goacc_create_worker_broadcast_record): ... this. + * config/gcn/gcn-tree.c (gcn_goacc_adjust_propagation_record): Rename + function to... + (gcn_goacc_create_worker_broadcast_record): ... this. + * config/gcn/gcn.c (TARGET_GOACC_ADJUST_PROPAGATION_RECORD): + Rename to... + (TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD): ... this. + +2021-08-09 Tejas Belagod + + PR target/101609 + * config/aarch64/aarch64-simd.md (vlshr3, vashr3): Use + the right iterator. + +2021-08-09 Thomas Schwinge + + * Makefile.in (GTFILES): Remove '$(srcdir)/omp-offload.c'. + +2021-08-09 Thomas Schwinge + + * builtins.def (DEF_GOACC_BUILTIN, DEF_GOMP_BUILTIN): Don't + consider '-foffload-abi'. + * common.opt (-foffload-abi): Remove 'Var', 'Init'. + * opts.c (common_handle_option) <-foffload-abi> [ACCEL_COMPILER]: + Ignore. + +2021-08-09 Thomas Schwinge + + * optc-gen.awk: Sanity check that 'Init' doesn't appear without + 'Var'. + +2021-08-09 Thomas Schwinge + + * omp-builtins.def (BUILT_IN_ACC_GET_DEVICE_TYPE): Remove. + +2021-08-09 Thomas Schwinge + + * doc/gty.texi (Files): Update. + +2021-08-09 Thomas Schwinge + + * doc/gty.texi (Files): Fix GTY header file example. + +2021-08-09 Roger Sayle + + * tree-ssa-ccp.c (value_mask_to_min_max): Helper function to + determine the upper and lower bounds from a mask-value pair. + (bit_value_unop) [ABS_EXPR, ABSU_EXPR]: Add support for + absolute value and unsigned absolute value expressions. + (bit_value_binop): Initialize *VAL's precision. 
+ [LT_EXPR, LE_EXPR]: Use value_mask_to_min_max to determine + upper and lower bounds of operands. Add LE_EXPR/GE_EXPR + support when the operands are unknown but potentially equal. + [MIN_EXPR, MAX_EXPR]: Support minimum/maximum expressions. + +2021-08-09 Bin Cheng + + * config/aarch64/aarch64.md + (*extend2_aarch64): Use %0. + 2021-08-08 Sergei Trofimovich * lra-constraints.c: Fix s/otput/output/ typo. diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP index 859da5a..7eb9baf 100644 --- a/gcc/DATESTAMP +++ b/gcc/DATESTAMP @@ -1 +1 @@ -20210809 +20210810 diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 034fc30..d0d2584 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,70 @@ +2021-08-09 Andrew MacLeod + + * gcc.dg/pr101741.c: New. + +2021-08-09 Martin Jambor + + PR testsuite/101654 + * gcc.dg/ipa/remref-6.c: Added missing -fdump-ipa-inline option. + +2021-08-09 Uroš Bizjak + + PR target/101812 + * gcc.target/i386/pr101812.c: New test. + +2021-08-09 Tejas Belagod + + * gcc.target/aarch64/vect-shr-reg.c: New testcase. + * gcc.target/aarch64/vect-shr-reg-run.c: Likewise. + +2021-08-09 Roger Sayle + + * gcc.dg/pr68217.c: Add -fno-tree-ccp option. + * gcc.dg/tree-ssa/vrp24.c: Add -fno-tree-ccp option. + * g++.dg/ipa/pure-const-3.C: Add -fno-tree-ccp option. + +2021-08-09 Tobias Burnus + + PR libfortran/101305 + PR fortran/101660 + * lib/gfortran.exp (gfortran_init): Add -I $specdir/libgfortran to + GFORTRAN_UNDER_TEST; update it when set by previous gfortran_init call. + * gfortran.dg/ISO_Fortran_binding_1.c: Use <...> not "..." for + ISO_Fortran_binding.h's #include. + * gfortran.dg/ISO_Fortran_binding_10.c: Likewise. + * gfortran.dg/ISO_Fortran_binding_11.c: Likewise. + * gfortran.dg/ISO_Fortran_binding_12.c: Likewise. + * gfortran.dg/ISO_Fortran_binding_15.c: Likewise. + * gfortran.dg/ISO_Fortran_binding_16.c: Likewise. + * gfortran.dg/ISO_Fortran_binding_17.c: Likewise. + * gfortran.dg/ISO_Fortran_binding_18.c: Likewise. 
+ * gfortran.dg/ISO_Fortran_binding_3.c: Likewise. + * gfortran.dg/ISO_Fortran_binding_5.c: Likewise. + * gfortran.dg/ISO_Fortran_binding_6.c: Likewise. + * gfortran.dg/ISO_Fortran_binding_7.c: Likewise. + * gfortran.dg/ISO_Fortran_binding_8.c: Likewise. + * gfortran.dg/ISO_Fortran_binding_9.c: Likewise. + * gfortran.dg/PR94327.c: Likewise. + * gfortran.dg/PR94331.c: Likewise. + * gfortran.dg/bind_c_array_params_3_aux.c: Likewise. + * gfortran.dg/iso_fortran_binding_uint8_array_driver.c: Likewise. + * gfortran.dg/pr93524.c: Likewise. + +2021-08-09 Jonathan Wright + + * gcc.target/aarch64/sve/dup_lane_1.c: Don't split + scan-assembler tests over multiple lines. Expect 32-bit + result values in 'w' registers. + * gcc.target/aarch64/sve/extract_1.c: Likewise. + * gcc.target/aarch64/sve/extract_2.c: Likewise. + * gcc.target/aarch64/sve/extract_3.c: Likewise. + * gcc.target/aarch64/sve/extract_4.c: Likewise. + +2021-08-09 Jonathan Wright + + * gcc.target/aarch64/vector_structure_intrinsics.c: Restrict + tests to little-endian targets. + 2021-08-08 Jeff Law * gcc.target/tic6x/rotdi16-scan.c: Pull rotate into its own function. -- cgit v1.1 From 813ccbe9d272cd67a8f075beea280de95f807492 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Thu, 5 Aug 2021 17:51:48 +0800 Subject: Support cond_ashr/lshr/ashl for vector integer modes under AVX512. gcc/ChangeLog: * config/i386/sse.md (cond_): New expander. (VI248_AVX512VLBW): New mode iterator. * config/i386/predicates.md (nonimmediate_or_const_vec_dup_operand): New predicate. gcc/testsuite/ChangeLog: * gcc.target/i386/cond_op_shift_d-1.c: New test. * gcc.target/i386/cond_op_shift_d-2.c: New test. * gcc.target/i386/cond_op_shift_q-1.c: New test. * gcc.target/i386/cond_op_shift_q-2.c: New test. * gcc.target/i386/cond_op_shift_ud-1.c: New test. * gcc.target/i386/cond_op_shift_ud-2.c: New test. * gcc.target/i386/cond_op_shift_uq-1.c: New test. * gcc.target/i386/cond_op_shift_uq-2.c: New test. 
* gcc.target/i386/cond_op_shift_uw-1.c: New test. * gcc.target/i386/cond_op_shift_uw-2.c: New test. * gcc.target/i386/cond_op_shift_w-1.c: New test. * gcc.target/i386/cond_op_shift_w-2.c: New test. --- gcc/config/i386/predicates.md | 4 + gcc/config/i386/sse.md | 36 ++++++++ gcc/testsuite/gcc.target/i386/cond_op_shift_d-1.c | 56 +++++++++++ gcc/testsuite/gcc.target/i386/cond_op_shift_d-2.c | 102 +++++++++++++++++++++ gcc/testsuite/gcc.target/i386/cond_op_shift_q-1.c | 11 +++ gcc/testsuite/gcc.target/i386/cond_op_shift_q-2.c | 5 + gcc/testsuite/gcc.target/i386/cond_op_shift_ud-1.c | 10 ++ gcc/testsuite/gcc.target/i386/cond_op_shift_ud-2.c | 5 + gcc/testsuite/gcc.target/i386/cond_op_shift_uq-1.c | 10 ++ gcc/testsuite/gcc.target/i386/cond_op_shift_uq-2.c | 5 + gcc/testsuite/gcc.target/i386/cond_op_shift_uw-1.c | 8 ++ gcc/testsuite/gcc.target/i386/cond_op_shift_uw-2.c | 6 ++ gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c | 8 ++ gcc/testsuite/gcc.target/i386/cond_op_shift_w-2.c | 6 ++ 14 files changed, 272 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_d-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_d-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_q-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_q-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_ud-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_ud-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_uq-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_uq-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_uw-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_uw-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_shift_w-2.c (limited to 'gcc') diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 6aa1ea3..129205a 100644 --- 
a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1161,6 +1161,10 @@ (ior (match_operand 0 "nonimmediate_operand") (match_code "const_vector"))) +(define_predicate "nonimmediate_or_const_vec_dup_operand" + (ior (match_operand 0 "nonimmediate_operand") + (match_test "const_vec_duplicate_p (op)"))) + ;; Return true when OP is either register operand, or any ;; CONST_VECTOR. (define_predicate "reg_or_const_vector_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a46a237..45b1ec2 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -506,6 +506,13 @@ (V4DI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) +(define_mode_iterator VI248_AVX512VLBW + [(V32HI "TARGET_AVX512BW") + (V16HI "TARGET_AVX512VL && TARGET_AVX512BW") + (V8HI "TARGET_AVX512VL && TARGET_AVX512BW") + V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") + V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) + (define_mode_iterator VI48_AVX2 [(V8SI "TARGET_AVX2") V4SI (V4DI "TARGET_AVX2") V2DI]) @@ -22786,6 +22793,35 @@ DONE; }) +(define_expand "cond_" + [(set (match_operand:VI248_AVX512VLBW 0 "register_operand") + (vec_merge:VI248_AVX512VLBW + (any_shift:VI248_AVX512VLBW + (match_operand:VI248_AVX512VLBW 2 "register_operand") + (match_operand:VI248_AVX512VLBW 3 "nonimmediate_or_const_vec_dup_operand")) + (match_operand:VI248_AVX512VLBW 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + if (const_vec_duplicate_p (operands[3])) + { + operands[3] = unwrap_const_vec_duplicate (operands[3]); + operands[3] = lowpart_subreg (DImode, operands[3], mode); + emit_insn (gen_3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + } + else + emit_insn (gen__v_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_insn "_ashrv" [(set (match_operand:VI48_AVX512F_AVX512VL 0 "register_operand" 
"=v") (ashiftrt:VI48_AVX512F_AVX512VL diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_d-1.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_d-1.c new file mode 100644 index 0000000..af047b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_d-1.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump-times ".COND_SHR" 2 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_SHL" 2 "optimized" } } */ +/* { dg-final { scan-assembler-times "vpsrad" 1 } } */ +/* { dg-final { scan-assembler-times "vpslld" 1 } } */ +/* { dg-final { scan-assembler-times "vpsravd" 1 } } */ +/* { dg-final { scan-assembler-times "vpsllvd" 1 } } */ + + +typedef short int16; +typedef unsigned short uint16; +typedef int int32; +typedef unsigned int uint32; +typedef long long int64; +typedef unsigned long long uint64; + +#ifndef NUM +#define NUM 800 +#endif +#ifndef TYPE +#define TYPE int +#endif + +TYPE a[NUM], b[NUM], c[NUM], d[NUM], e[NUM], j[NUM]; +#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#define MAX(X,Y) ((X) < (Y) ? 
(Y) : (X)) + +#define BINC(OPNAME, OP) \ + void \ + __attribute__ ((noipa,optimize ("O3"))) \ + foo_##OPNAME##_const () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + a[i] = d[i] OP 3; \ + else \ + a[i] = MAX(d[i], e[i]); \ + } + +#define BINV(OPNAME, OP) \ + void \ + __attribute__ ((noipa,optimize ("O3"))) \ + foo_##OPNAME##_variable () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + a[i] = d[i] OP e[i]; \ + else \ + a[i] = MAX(d[i], e[i]); \ + } + +BINC (shl, <<); +BINC (shr, >>); +BINV (shl, <<); +BINV (shr, >>); diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_d-2.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_d-2.c new file mode 100644 index 0000000..449e5b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_d-2.c @@ -0,0 +1,102 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512vl } */ + +#define AVX512VL +#ifndef CHECK +#define CHECK "avx512f-helper.h" +#endif + +#include CHECK + +#include "cond_op_shift_d-1.c" + +#define BINO2C(OPNAME, OP) \ + void \ + __attribute__ ((noipa,optimize ("O2"))) \ + foo_o2_##OPNAME##_const () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + j[i] = d[i] OP 3; \ + else \ + j[i] = MAX(d[i], e[i]); \ + } + +#define BINO2V(OPNAME, OP) \ + void \ + __attribute__ ((noipa,optimize ("O2"))) \ + foo_o2_##OPNAME##_variable () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + j[i] = d[i] OP e[i]; \ + else \ + j[i] = MAX(d[i], e[i]); \ + } + +BINO2C (shl, <<); +BINO2C (shr, >>); +BINO2V (shl, <<); +BINO2V (shr, >>); + +static void +test_256 (void) +{ + int sign = -1; + for (int i = 0; i != NUM; i++) + { + a[i] = 0; + d[i] = i * 2; + e[i] = (i * i * 3 - i * 9 + 6)%8; + b[i] = i * 83; + c[i] = b[i] + sign; + sign *= -1; + j[i] = 1; + } + foo_shl_const (); + foo_o2_shl_const (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + 
+ foo_shr_const (); + foo_o2_shr_const (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + foo_shl_variable (); + foo_o2_shl_variable (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + + foo_shr_variable (); + foo_o2_shr_variable (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } +} + +static void +test_128 () +{ + +} diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_q-1.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_q-1.c new file mode 100644 index 0000000..1b981b5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_q-1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized -DTYPE=int64" } */ +/* { dg-final { scan-tree-dump-times ".COND_SHR" 2 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_SHL" 2 "optimized" } } */ +/* { dg-final { scan-assembler-times "vpsravq" 1 } } */ +/* { dg-final { scan-assembler-times "vpsllvq" 1 } } */ +/* { dg-final { scan-assembler-times "vpsravq" 1 } } */ +/* { dg-final { scan-assembler-times "vpsllvq" 1 } } */ + + +#include "cond_op_shift_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_q-2.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_q-2.c new file mode 100644 index 0000000..94f1d71 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_q-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=int64" } */ +/* { dg-require-effective-target avx512vl } */ + +#include "cond_op_shift_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_ud-1.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_ud-1.c new file mode 100644 index 0000000..eea0f67 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_ud-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 
-fdump-tree-optimized -DTYPE=uint32" } */ +/* { dg-final { scan-tree-dump-times ".COND_SHR" 2 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_SHL" 2 "optimized" } } */ +/* { dg-final { scan-assembler-times "vpsrlvd" 1 } } */ +/* { dg-final { scan-assembler-times "vpsllvd" 1 } } */ +/* { dg-final { scan-assembler-times "vpsrlvd" 1 } } */ +/* { dg-final { scan-assembler-times "vpsllvd" 1 } } */ + +#include "cond_op_shift_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_ud-2.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_ud-2.c new file mode 100644 index 0000000..b18c568 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_ud-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=uint32" } */ +/* { dg-require-effective-target avx512vl } */ + +#include "cond_op_shift_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_uq-1.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_uq-1.c new file mode 100644 index 0000000..77a0388 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_uq-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized -DTYPE=uint64" } */ +/* { dg-final { scan-tree-dump-times ".COND_SHR" 2 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_SHL" 2 "optimized" } } */ +/* { dg-final { scan-assembler-times "vpsrlq" 1 } } */ +/* { dg-final { scan-assembler-times "vpsllq" 1 } } */ +/* { dg-final { scan-assembler-times "vpsrlq" 1 } } */ +/* { dg-final { scan-assembler-times "vpsllq" 1 } } */ + +#include "cond_op_shift_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_uq-2.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_uq-2.c new file mode 100644 index 0000000..a9e0acf --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_uq-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=uint64" } */ +/* { 
dg-require-effective-target avx512vl } */ + +#include "cond_op_shift_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_uw-1.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_uw-1.c new file mode 100644 index 0000000..b84cdd89 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_uw-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized -DTYPE=uint16" } */ +/* { dg-final { scan-tree-dump-times ".COND_SHR" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_SHL" 1 "optimized" } } */ +/* { dg-final { scan-assembler-times "vpsrlw" 1 } } */ +/* { dg-final { scan-assembler-times "vpsllw" 1 } } */ + +#include "cond_op_shift_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_uw-2.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_uw-2.c new file mode 100644 index 0000000..cfdece9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_uw-2.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mavx512bw -mprefer-vector-width=256 -DTYPE=uint16" } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512bw } */ + +#include "cond_op_shift_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c new file mode 100644 index 0000000..54c854f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized -DTYPE=int16" } */ +/* { dg-final { scan-tree-dump-times ".COND_SHR" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_SHL" 1 "optimized" } } */ +/* { dg-final { scan-assembler-times "vpsraw" 1 } } */ +/* { dg-final { scan-assembler-times "vpsllw" 1 } } */ + +#include "cond_op_shift_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_w-2.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_w-2.c new file mode 100644 index 
0000000..5776826 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_w-2.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mavx512bw -mprefer-vector-width=256 -DTYPE=int16" } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512bw } */ + +#include "cond_op_shift_d-2.c" -- cgit v1.1 From 3d7ccbc1efbd475031a9a4a6110c531f71fbf631 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 6 Aug 2021 12:32:01 -0700 Subject: x86: Optimize load of const FP all bits set vectors Check float_vector_all_ones_operand for vector floating-point modes to optimize load of const floating-point all bits set vectors. gcc/ PR target/101804 * config/i386/constraints.md (BC): Document for integer SSE constant all bits set operand. (BF): New constraint for const floating-point all bits set vectors. * config/i386/i386.c (standard_sse_constant_p): Likewise. (standard_sse_constant_opcode): Likewise. * config/i386/sse.md (sseconstm1): New mode attribute. (mov_internal): Replace BC with . gcc/testsuite/ PR target/101804 * gcc.target/i386/avx2-gather-2.c: Pass -march=skylake instead of "-mavx2 -mtune=skylake". Scan vpcmpeqd. --- gcc/config/i386/constraints.md | 10 ++++++++-- gcc/config/i386/i386.c | 11 +++++++++-- gcc/config/i386/sse.md | 11 ++++++++++- gcc/testsuite/gcc.target/i386/avx2-gather-2.c | 3 ++- 4 files changed, 29 insertions(+), 6 deletions(-) (limited to 'gcc') diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md index 4aa28a5..87cceac 100644 --- a/gcc/config/i386/constraints.md +++ b/gcc/config/i386/constraints.md @@ -166,7 +166,8 @@ ;; s Sibcall memory operand, not valid for TARGET_X32 ;; w Call memory operand, not valid for TARGET_X32 ;; z Constant call address operand. -;; C SSE constant operand. +;; C Integer SSE constant with all bits set operand. +;; F Floating-point SSE constant with all bits set operand. (define_constraint "Bf" "@internal Flags register operand." 
@@ -216,11 +217,16 @@ (match_operand 0 "constant_call_address_operand")) (define_constraint "BC" - "@internal SSE constant -1 operand." + "@internal integer SSE constant with all bits set operand." (and (match_test "TARGET_SSE") (ior (match_test "op == constm1_rtx") (match_operand 0 "vector_all_ones_operand")))) +(define_constraint "BF" + "@internal floating-point SSE constant with all bits set operand." + (and (match_test "TARGET_SSE") + (match_operand 0 "float_vector_all_ones_operand"))) + ;; Integer constant constraints. (define_constraint "Wb" "Integer constant in the range 0 @dots{} 7, for 8-bit shifts." diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index aea224a..4d4ab6a 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -5073,7 +5073,11 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) if (x == const0_rtx || const0_operand (x, mode)) return 1; - if (x == constm1_rtx || vector_all_ones_operand (x, mode)) + if (x == constm1_rtx + || vector_all_ones_operand (x, mode) + || ((GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT + || GET_MODE_CLASS (pred_mode) == MODE_VECTOR_FLOAT) + && float_vector_all_ones_operand (x, mode))) { /* VOIDmode integer constant, get mode from the predicate. 
*/ if (mode == VOIDmode) @@ -5171,7 +5175,10 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) gcc_unreachable (); } } - else if (x == constm1_rtx || vector_all_ones_operand (x, mode)) + else if (x == constm1_rtx + || vector_all_ones_operand (x, mode) + || (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT + && float_vector_all_ones_operand (x, mode))) { enum attr_mode insn_mode = get_attr_mode (insn); diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 45b1ec2..2b0d10e 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -784,6 +784,15 @@ (V4SF "V4SF") (V2DF "V2DF") (TI "TI")]) +;; SSE constant -1 constraint +(define_mode_attr sseconstm1 + [(V64QI "BC") (V32HI "BC") (V16SI "BC") (V8DI "BC") (V4TI "BC") + (V32QI "BC") (V16HI "BC") (V8SI "BC") (V4DI "BC") (V2TI "BC") + (V16QI "BC") (V8HI "BC") (V4SI "BC") (V2DI "BC") (V1TI "BC") + (V16SF "BF") (V8DF "BF") + (V8SF "BF") (V4DF "BF") + (V4SF "BF") (V2DF "BF")]) + ;; Mapping of vector modes to corresponding mask size (define_mode_attr avx512fmaskmode [(V64QI "DI") (V32QI "SI") (V16QI "HI") @@ -1063,7 +1072,7 @@ [(set (match_operand:VMOVE 0 "nonimmediate_operand" "=v,v ,v ,m") (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand" - " C,BC,vm,v"))] + " C,,vm,v"))] "TARGET_SSE && (register_operand (operands[0], mode) || register_operand (operands[1], mode))" diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c index 1a704af..ad5ef73 100644 --- a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c +++ b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c @@ -1,6 +1,7 @@ /* { dg-do compile } */ -/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details -mtune=skylake" } */ +/* { dg-options "-O3 -fdump-tree-vect-details -march=skylake" } */ #include "avx2-gather-1.c" /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 16 "vect" } } */ +/* { dg-final { scan-assembler "vpcmpeqd" } } */ -- cgit v1.1 From 
0631faf87a197145acd833249bf8f20a1c4aaabf Mon Sep 17 00:00:00 2001 From: Martin Uecker Date: Tue, 10 Aug 2021 07:42:51 +0200 Subject: Evaluate arguments of sizeof that are structs of variable size. Evaluate arguments of sizeof for all types of variable size and not just for VLAs. This fixes some issues related to [PR29970] where statement expressions need to be evaluated so that the size is well defined. 2021-08-10 Martin Uecker gcc/c/ PR c/29970 * c-typeck.c (c_expr_sizeof_expr): Evaluate size expressions for structs of variable size. gcc/testsuite/ PR c/29970 * gcc.dg/vla-stexp-1.c: New test. --- gcc/c/c-typeck.c | 2 +- gcc/testsuite/gcc.dg/vla-stexp-1.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.dg/vla-stexp-1.c (limited to 'gcc') diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c index 5d6565b..c5bf337 100644 --- a/gcc/c/c-typeck.c +++ b/gcc/c/c-typeck.c @@ -2992,7 +2992,7 @@ c_expr_sizeof_expr (location_t loc, struct c_expr expr) c_last_sizeof_loc = loc; ret.original_code = SIZEOF_EXPR; ret.original_type = NULL; - if (c_vla_type_p (TREE_TYPE (folded_expr))) + if (C_TYPE_VARIABLE_SIZE (TREE_TYPE (folded_expr))) { /* sizeof is evaluated when given a vla (C99 6.5.3.4p2). */ ret.value = build2 (C_MAYBE_CONST_EXPR, TREE_TYPE (ret.value), diff --git a/gcc/testsuite/gcc.dg/vla-stexp-1.c b/gcc/testsuite/gcc.dg/vla-stexp-1.c new file mode 100644 index 0000000..97d6693 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vla-stexp-1.c @@ -0,0 +1,18 @@ +/* PR29970*/ +/* { dg-do run } */ +/* { dg-options "-Wall -O0" } */ + +int foo(void) +{ + int n = 0; + return sizeof(*({ n = 10; struct foo { int x[n]; } x; &x; })); +} + + +int main() +{ + if (sizeof(struct foo { int x[10]; }) != foo()) + __builtin_abort(); + + return 0; +} -- cgit v1.1