diff options
author | James Greenhalgh <james.greenhalgh@arm.com> | 2016-02-16 16:02:09 +0000 |
---|---|---|
committer | James Greenhalgh <jgreenhalgh@gcc.gnu.org> | 2016-02-16 16:02:09 +0000 |
commit | 35a093b60bcc40cf31e47a2336b50311578fc973 (patch) | |
tree | b2c3f6801774f27a05ac2e7e0e6d644d1ad4f9c8 | |
parent | 51b3f0773f84ef1e3aac56e687f67027c3fb070c (diff) | |
download | gcc-35a093b60bcc40cf31e47a2336b50311578fc973.zip gcc-35a093b60bcc40cf31e47a2336b50311578fc973.tar.gz gcc-35a093b60bcc40cf31e47a2336b50311578fc973.tar.bz2 |
[Patch AArch64] GCC 6 regression in vector performance. - Fix vector initialization to happen with lane load instructions.
gcc/
* config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
always use lane loads to construct non-constant vectors.
gcc/testsuite/
* gcc.target/aarch64/vector_initialization_nostack.c: New.
Co-Authored-By: Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>
From-SVN: r233461
-rw-r--r-- | gcc/ChangeLog | 6 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64.c | 56 | ||||
-rw-r--r-- | gcc/testsuite/ChangeLog | 5 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c | 53 |
4 files changed, 92 insertions, 28 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 90d91a7..08b3f0d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,4 +1,10 @@
 2016-02-16  James Greenhalgh  <james.greenhalgh@arm.com>
+	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
+
+	* config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
+	always use lane loads to construct non-constant vectors.
+
+2016-02-16  James Greenhalgh  <james.greenhalgh@arm.com>
 
 	* config/aarch64/aarch64.md
 	(arch64_sqrdml<SQRDMLH_AS:rdma_as>h_lane<mode>): Fix register
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 8326a4c..974a789 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11053,28 +11053,37 @@ aarch64_simd_make_constant (rtx vals)
   return NULL_RTX;
 }
 
+/* Expand a vector initialisation sequence, such that TARGET is
+   initialised to contain VALS.  */
+
 void
 aarch64_expand_vector_init (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   machine_mode inner_mode = GET_MODE_INNER (mode);
+  /* The number of vector elements.  */
   int n_elts = GET_MODE_NUNITS (mode);
+  /* The number of vector elements which are not constant.  */
   int n_var = 0;
   rtx any_const = NULL_RTX;
+  /* The first element of vals.  */
+  rtx v0 = XVECEXP (vals, 0, 0);
   bool all_same = true;
 
+  /* Count the number of variable elements to initialise.  */
   for (int i = 0; i < n_elts; ++i)
     {
       rtx x = XVECEXP (vals, 0, i);
-      if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
+      if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
 	++n_var;
       else
 	any_const = x;
 
-      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
-	all_same = false;
+      all_same &= rtx_equal_p (x, v0);
     }
 
+  /* No variable elements, hand off to aarch64_simd_make_constant which knows
+     how best to handle this.  */
   if (n_var == 0)
     {
       rtx constant = aarch64_simd_make_constant (vals);
@@ -11088,14 +11097,15 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   /* Splat a single non-constant element if we can.  */
   if (all_same)
     {
-      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
+      rtx x = copy_to_mode_reg (inner_mode, v0);
       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
       return;
     }
 
-  /* Half the fields (or less) are non-constant.  Load constant then overwrite
-     varying fields.  Hope that this is more efficient than using the stack.  */
-  if (n_var <= n_elts/2)
+  /* Initialise a vector which is part-variable.  We want to first try
+     to build those lanes which are constant in the most efficient way we
+     can.  */
+  if (n_var != n_elts)
     {
       rtx copy = copy_rtx (vals);
 
@@ -11122,31 +11132,21 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 	  XVECEXP (copy, 0, i) = subst;
 	}
       aarch64_expand_vector_init (target, copy);
+    }
 
-      /* Insert variables.  */
-      enum insn_code icode = optab_handler (vec_set_optab, mode);
-      gcc_assert (icode != CODE_FOR_nothing);
+  /* Insert the variable lanes directly.  */
 
-      for (int i = 0; i < n_elts; i++)
-	{
-	  rtx x = XVECEXP (vals, 0, i);
-	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
-	    continue;
-	  x = copy_to_mode_reg (inner_mode, x);
-	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
-	}
-      return;
-    }
+  enum insn_code icode = optab_handler (vec_set_optab, mode);
+  gcc_assert (icode != CODE_FOR_nothing);
 
-  /* Construct the vector in memory one field at a time
-     and load the whole vector.  */
-  rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
   for (int i = 0; i < n_elts; i++)
-    emit_move_insn (adjust_address_nv (mem, inner_mode,
-				    i * GET_MODE_SIZE (inner_mode)),
-		    XVECEXP (vals, 0, i));
-  emit_move_insn (target, mem);
-
+    {
+      rtx x = XVECEXP (vals, 0, i);
+      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
+	continue;
+      x = copy_to_mode_reg (inner_mode, x);
+      emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+    }
 }
 
 static unsigned HOST_WIDE_INT
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 6dd7c2d..cf1da01 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2016-02-16  James Greenhalgh  <james.greenhalgh@arm.com>
+	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
+
+	* gcc.target/aarch64/vector_initialization_nostack.c: New.
+
 2016-02-16  Jakub Jelinek  <jakub@redhat.com>
 
 	PR middle-end/69801
diff --git a/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
new file mode 100644
index 0000000..bbad04d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize -fno-vect-cost-model" } */
+float arr_f[100][100];
+float
+f9 (void)
+{
+
+  int i;
+  float sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_f[i][0] * arr_f[0][i];
+  return sum;
+
+}
+
+
+int arr[100][100];
+int
+f10 (void)
+{
+
+  int i;
+  int sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr[i][0] * arr[0][i];
+  return sum;
+
+}
+
+double arr_d[100][100];
+double
+f11 (void)
+{
+  int i;
+  double sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_d[i][0] * arr_d[0][i];
+  return sum;
+}
+
+char arr_c[100][100];
+char
+f12 (void)
+{
+  int i;
+  char sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_c[i][0] * arr_c[0][i];
+  return sum;
+}
+
+
+/* { dg-final { scan-assembler-not "sp" } } */