diff options
-rw-r--r-- | gcc/ChangeLog | 10 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64.c | 51 | ||||
-rw-r--r-- | gcc/simplify-rtx.c | 54 | ||||
-rw-r--r-- | gcc/testsuite/ChangeLog | 4 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/vect-slp-dup.c | 20 |
5 files changed, 136 insertions, 3 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c434149..1a245f7 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +2018-05-17 James Greenhalgh <james.greenhalgh@arm.com> + Kyrylo Tkachov <kyrylo.tkachov@arm.com> + + * config/aarch64/aarch64.c (aarch64_expand_vector_init): Modify + code generation for cases where splatting a value is not useful. + * simplify-rtx.c (simplify_ternary_operation): Simplify + vec_merge across a vec_duplicate and a paradoxical subreg forming a vector + mode to a vec_concat. + + 2018-05-17 Olga Makhotina <olga.makhotina@intel.com> * config.gcc: Support "goldmont-plus". diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index a2003fe..6bf6c05 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -13916,9 +13916,54 @@ aarch64_expand_vector_init (rtx target, rtx vals) maxv = matches[i][1]; } - /* Create a duplicate of the most common element. */ - rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement)); - aarch64_emit_move (target, gen_vec_duplicate (mode, x)); + /* Create a duplicate of the most common element, unless all elements + are equally useless to us, in which case just immediately set the + vector register using the first element. */ + + if (maxv == 1) + { + /* For vectors of two 64-bit elements, we can do even better. */ + if (n_elts == 2 + && (inner_mode == E_DImode + || inner_mode == E_DFmode)) + + { + rtx x0 = XVECEXP (vals, 0, 0); + rtx x1 = XVECEXP (vals, 0, 1); + /* Combine can pick up this case, but handling it directly + here leaves clearer RTL. + + This is load_pair_lanes<mode>, and also gives us a clean-up + for store_pair_lanes<mode>. */ + if (memory_operand (x0, inner_mode) + && memory_operand (x1, inner_mode) + && !STRICT_ALIGNMENT + && rtx_equal_p (XEXP (x1, 0), + plus_constant (Pmode, + XEXP (x0, 0), + GET_MODE_SIZE (inner_mode)))) + { + rtx t; + if (inner_mode == DFmode) + t = gen_load_pair_lanesdf (target, x0, x1); + else + t = gen_load_pair_lanesdi (target, x0, x1); + emit_insn (t); + return; + } + } + /* The subreg-move sequence below will move into lane zero of the + vector register. For big-endian we want that position to hold + the last element of VALS. */ + maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0; + rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement)); + aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode)); + } + else + { + rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement)); + aarch64_emit_move (target, gen_vec_duplicate (mode, x)); + } /* Insert the rest. */ for (int i = 0; i < n_elts; i++) diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c index e96c9d1..d2714db 100644 --- a/gcc/simplify-rtx.c +++ b/gcc/simplify-rtx.c @@ -5891,6 +5891,60 @@ simplify_ternary_operation (enum rtx_code code, machine_mode mode, return simplify_gen_binary (VEC_CONCAT, mode, newop0, newop1); } + /* Replace: + + (vec_merge:outer (vec_duplicate:outer x:inner) + (subreg:outer y:inner 0) + (const_int N)) + + with (vec_concat:outer x:inner y:inner) if N == 1, + or (vec_concat:outer y:inner x:inner) if N == 2. + + Implicitly, this means we have a paradoxical subreg, but such + a check is cheap, so make it anyway. + + Only applies for vectors of two elements. */ + if (GET_CODE (op0) == VEC_DUPLICATE + && GET_CODE (op1) == SUBREG + && GET_MODE (op1) == GET_MODE (op0) + && GET_MODE (SUBREG_REG (op1)) == GET_MODE (XEXP (op0, 0)) + && paradoxical_subreg_p (op1) + && subreg_lowpart_p (op1) + && known_eq (GET_MODE_NUNITS (GET_MODE (op0)), 2) + && known_eq (GET_MODE_NUNITS (GET_MODE (op1)), 2) + && IN_RANGE (sel, 1, 2)) + { + rtx newop0 = XEXP (op0, 0); + rtx newop1 = SUBREG_REG (op1); + if (sel == 2) + std::swap (newop0, newop1); + return simplify_gen_binary (VEC_CONCAT, mode, newop0, newop1); + } + + /* Same as above but with switched operands: + Replace (vec_merge:outer (subreg:outer x:inner 0) + (vec_duplicate:outer y:inner) + (const_int N)) + + with (vec_concat:outer x:inner y:inner) if N == 1, + or (vec_concat:outer y:inner x:inner) if N == 2. */ + if (GET_CODE (op1) == VEC_DUPLICATE + && GET_CODE (op0) == SUBREG + && GET_MODE (op0) == GET_MODE (op1) + && GET_MODE (SUBREG_REG (op0)) == GET_MODE (XEXP (op1, 0)) + && paradoxical_subreg_p (op0) + && subreg_lowpart_p (op0) + && known_eq (GET_MODE_NUNITS (GET_MODE (op1)), 2) + && known_eq (GET_MODE_NUNITS (GET_MODE (op0)), 2) + && IN_RANGE (sel, 1, 2)) + { + rtx newop0 = SUBREG_REG (op0); + rtx newop1 = XEXP (op1, 0); + if (sel == 2) + std::swap (newop0, newop1); + return simplify_gen_binary (VEC_CONCAT, mode, newop0, newop1); + } + /* Replace (vec_merge (vec_duplicate x) (vec_duplicate y) (const_int n)) with (vec_concat x y) or (vec_concat y x) depending on value diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index f360067..fe495f8 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2018-05-17 James Greenhalgh <james.greenhalgh@arm.com> + + * gcc.target/aarch64/vect-slp-dup.c: New. + 2018-05-17 Paolo Carlini <paolo.carlini@oracle.com> PR c++/85713 diff --git a/gcc/testsuite/gcc.target/aarch64/vect-slp-dup.c b/gcc/testsuite/gcc.target/aarch64/vect-slp-dup.c new file mode 100644 index 0000000..0541e48 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-slp-dup.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ + +/* { dg-options "-O3 -ftree-vectorize -fno-vect-cost-model" } */ + +void bar (double); + +void +foo (double *restrict in, double *restrict in2, + double *restrict out1, double *restrict out2) +{ + for (int i = 0; i < 1024; i++) + { + out1[i] = in[i] + 2.0 * in[i+128]; + out1[i+1] = in[i+1] + 2.0 * in2[i]; + bar (in[i]); + } +} + +/* { dg-final { scan-assembler-not "dup\tv\[0-9\]+.2d, v\[0-9\]+" } } */ + |