diff options
author | Kyrylo Tkachov <kyrylo.tkachov@arm.com> | 2018-06-20 08:57:17 +0000 |
---|---|---|
committer | Kyrylo Tkachov <ktkachov@gcc.gnu.org> | 2018-06-20 08:57:17 +0000 |
commit | 9f5361c8cac181dbc79b7302d7241a61e0ce2386 (patch) | |
tree | 199ba2d587d84ee6fc93df8134cce47cb4793499 /gcc | |
parent | de840bde88a7719fa022f1de6f4e407bf5b4c8a8 (diff) | |
download | gcc-9f5361c8cac181dbc79b7302d7241a61e0ce2386.zip gcc-9f5361c8cac181dbc79b7302d7241a61e0ce2386.tar.gz gcc-9f5361c8cac181dbc79b7302d7241a61e0ce2386.tar.bz2 |
[AArch64] Support for LDP/STP of Q-registers
This patch adds support for generating LDPs and STPs of Q-registers.
This allows for more compact code generation and makes better use of the ISA.
It's implemented in a straightforward way by allowing 16-byte modes in the
sched-fusion machinery and adding appropriate peepholes in aarch64-ldpstp.md
as well as the patterns themselves in aarch64-simd.md.
It adds a new no_ldp_stp_qregs tuning flag.
I use it to restrict the peepholes in aarch64-ldpstp.md from merging the
operations together into PARALLELs. I also use it to restrict the sched fusion
check that brings such loads and stores together. This is enough to avoid
forming the pairs when the tuning flag is set.
I didn't see any non-noise performance effect on SPEC2017 on Cortex-A72 and Cortex-A53.
* config/aarch64/aarch64-tuning-flags.def (no_ldp_stp_qregs): New.
* config/aarch64/aarch64.c (xgene1_tunings): Add
AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS to tune_flags.
(aarch64_mode_valid_for_sched_fusion_p):
Allow 16-byte modes.
(aarch64_classify_address): Allow 16-byte modes for load_store_pair_p.
* config/aarch64/aarch64-ldpstp.md: Add peepholes for LDP STP of
128-bit modes.
* config/aarch64/aarch64-simd.md (load_pair<VQ:mode><VQ2:mode>):
New pattern.
(vec_store_pair<VQ:mode><VQ2:mode>): Likewise.
* config/aarch64/iterators.md (VQ2): New mode iterator.
* gcc.target/aarch64/ldp_stp_q.c: New test.
* gcc.target/aarch64/stp_vec_128_1.c: Likewise.
* gcc.target/aarch64/ldp_stp_q_disable.c: Likewise.
From-SVN: r261796
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog | 15 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-ldpstp.md | 31 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-simd.md | 28 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-tuning-flags.def | 3 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64.c | 13 | ||||
-rw-r--r-- | gcc/config/aarch64/iterators.md | 3 | ||||
-rw-r--r-- | gcc/testsuite/ChangeLog | 6 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/ldp_stp_q.c | 26 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c | 26 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/stp_vec_128_1.c | 20 |
10 files changed, 167 insertions, 4 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c263979..ae49015 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,18 @@ +2018-06-20 Kyrylo Tkachov <kyrylo.tkachov@arm.com> + + * config/aarch64/aarch64-tuning-flags.def (no_ldp_stp_qregs): New. + * config/aarch64/aarch64.c (xgene1_tunings): Add + AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS to tune_flags. + (aarch64_mode_valid_for_sched_fusion_p): + Allow 16-byte modes. + (aarch64_classify_address): Allow 16-byte modes for load_store_pair_p. + * config/aarch64/aarch64-ldpstp.md: Add peepholes for LDP STP of + 128-bit modes. + * config/aarch64/aarch64-simd.md (load_pair<VQ:mode><VQ2:mode>): + New pattern. + (vec_store_pair<VQ:mode><VQ2:mode>): Likewise. + * config/aarch64/iterators.md (VQ2): New mode iterator. + 2018-06-20 Martin Liska <mliska@suse.cz> * tree-switch-conversion.c (jump_table_cluster::can_be_handled): diff --git a/gcc/config/aarch64/aarch64-ldpstp.md b/gcc/config/aarch64/aarch64-ldpstp.md index 7f1031d..650a80d 100644 --- a/gcc/config/aarch64/aarch64-ldpstp.md +++ b/gcc/config/aarch64/aarch64-ldpstp.md @@ -91,6 +91,37 @@ aarch64_swap_ldrstr_operands (operands, false); }) +(define_peephole2 + [(set (match_operand:VQ 0 "register_operand" "") + (match_operand:VQ 1 "memory_operand" "")) + (set (match_operand:VQ2 2 "register_operand" "") + (match_operand:VQ2 3 "memory_operand" ""))] + "TARGET_SIMD + && aarch64_operands_ok_for_ldpstp (operands, true, <VQ:MODE>mode) + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0" + [(parallel [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))])] +{ + aarch64_swap_ldrstr_operands (operands, true); +}) + +(define_peephole2 + [(set (match_operand:VQ 0 "memory_operand" "") + (match_operand:VQ 1 "register_operand" "")) + (set (match_operand:VQ2 2 "memory_operand" "") + (match_operand:VQ2 3 "register_operand" ""))] + "TARGET_SIMD + && aarch64_operands_ok_for_ldpstp (operands, false, <VQ:MODE>mode) + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0" + [(parallel [(set (match_dup 0) (match_dup 1)) + (set (match_dup 2) (match_dup 3))])] +{ + aarch64_swap_ldrstr_operands (operands, false); +}) + + ;; Handle sign/zero extended consecutive load/store. (define_peephole2 diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index dc4e026..aac5fa1 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -205,6 +205,34 @@ [(set_attr "type" "neon_stp")] ) +(define_insn "load_pair<VQ:mode><VQ2:mode>" + [(set (match_operand:VQ 0 "register_operand" "=w") + (match_operand:VQ 1 "aarch64_mem_pair_operand" "Ump")) + (set (match_operand:VQ2 2 "register_operand" "=w") + (match_operand:VQ2 3 "memory_operand" "m"))] + "TARGET_SIMD + && rtx_equal_p (XEXP (operands[3], 0), + plus_constant (Pmode, + XEXP (operands[1], 0), + GET_MODE_SIZE (<VQ:MODE>mode)))" + "ldp\\t%q0, %q2, %1" + [(set_attr "type" "neon_ldp_q")] +) + +(define_insn "vec_store_pair<VQ:mode><VQ2:mode>" + [(set (match_operand:VQ 0 "aarch64_mem_pair_operand" "=Ump") + (match_operand:VQ 1 "register_operand" "w")) + (set (match_operand:VQ2 2 "memory_operand" "=m") + (match_operand:VQ2 3 "register_operand" "w"))] + "TARGET_SIMD && rtx_equal_p (XEXP (operands[2], 0), + plus_constant (Pmode, + XEXP (operands[0], 0), + GET_MODE_SIZE (<VQ:MODE>mode)))" + "stp\\t%q1, %q3, %0" + [(set_attr "type" "neon_stp_q")] +) + + (define_split [(set (match_operand:VQ 0 "register_operand" "") (match_operand:VQ 1 "register_operand" ""))] diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index ea9ead2..fb9700c 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -41,4 +41,7 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW) are not considered cheap. */ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND) +/* Disallow load/store pair instructions on Q-registers. */ +AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS) + #undef AARCH64_EXTRA_TUNING_OPTION diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index bd0ac2f..b88e7ca 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -880,7 +880,7 @@ static const struct tune_params xgene1_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */ &generic_prefetch_tune }; @@ -5690,7 +5690,10 @@ aarch64_mode_valid_for_sched_fusion_p (machine_mode mode) return mode == SImode || mode == DImode || mode == SFmode || mode == DFmode || (aarch64_vector_mode_supported_p (mode) - && known_eq (GET_MODE_SIZE (mode), 8)); + && (known_eq (GET_MODE_SIZE (mode), 8) + || (known_eq (GET_MODE_SIZE (mode), 16) + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0))); } /* Return true if REGNO is a virtual pointer register, or an eliminable @@ -5847,7 +5850,8 @@ aarch64_classify_address (struct aarch64_address_info *info, if (load_store_pair_p) return ((known_eq (GET_MODE_SIZE (mode), 4) - || known_eq (GET_MODE_SIZE (mode), 8)) + || known_eq (GET_MODE_SIZE (mode), 8) + || known_eq (GET_MODE_SIZE (mode), 16)) && aarch64_offset_7bit_signed_scaled_p (mode, offset)); else return (offset_9bit_signed_unscaled_p (mode, offset) @@ -5907,7 +5911,8 @@ aarch64_classify_address (struct aarch64_address_info *info, if (load_store_pair_p) return ((known_eq (GET_MODE_SIZE (mode), 4) - || known_eq (GET_MODE_SIZE (mode), 8)) + || known_eq (GET_MODE_SIZE (mode), 8) + || known_eq (GET_MODE_SIZE (mode), 16)) && aarch64_offset_7bit_signed_scaled_p (mode, offset)); else return offset_9bit_signed_unscaled_p (mode, offset); diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index dbe1e34..c5ef2ee 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -84,6 +84,9 @@ ;; Quad vector modes. (define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF]) +;; Copy of the above. +(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF]) + ;; Quad integer vector modes. (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI]) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 4dd987b..a2c5eab 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2018-06-20 Kyrylo Tkachov <kyrylo.tkachov@arm.com> + + * gcc.target/aarch64/ldp_stp_q.c: New test. + * gcc.target/aarch64/stp_vec_128_1.c: Likewise. + * gcc.target/aarch64/ldp_stp_q_disable.c: Likewise. + 2018-06-20 Martin Liska <mliska@suse.cz> * gcc.dg/tree-ssa/vrp104.c: Grep just for GIMPLE IL. diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_q.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_q.c new file mode 100644 index 0000000..b4f2586 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_q.c @@ -0,0 +1,26 @@ +/* { dg-options "-O2 -moverride=tune=none" } */ + +typedef float float32x4_t __attribute__ ((__vector_size__ ((16)))); + +float32x4_t arr[4][4]; + +void +foo (float32x4_t x, float32x4_t y) +{ + arr[0][1] = x; + arr[1][0] = y; + arr[2][0] = x; + arr[1][1] = y; + arr[0][2] = x; + arr[0][3] = y; + arr[1][2] = x; + arr[2][1] = y; + arr[3][0] = x; + arr[3][1] = y; + arr[2][2] = x; + arr[1][3] = y; + arr[2][3] = x; + arr[3][2] = y; +} + +/* { dg-final { scan-assembler-times "stp\tq\[0-9\]+, q\[0-9\]" 7 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c new file mode 100644 index 0000000..38c1870 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_q_disable.c @@ -0,0 +1,26 @@ +/* { dg-options "-O2 -moverride=tune=no_ldp_stp_qregs" } */ + +typedef float float32x4_t __attribute__ ((__vector_size__ ((16)))); + +float32x4_t arr[4][4]; + +void +foo (float32x4_t x, float32x4_t y) +{ + arr[0][1] = x; + arr[1][0] = y; + arr[2][0] = x; + arr[1][1] = y; + arr[0][2] = x; + arr[0][3] = y; + arr[1][2] = x; + arr[2][1] = y; + arr[3][0] = x; + arr[3][1] = y; + arr[2][2] = x; + arr[1][3] = y; + arr[2][3] = x; + arr[3][2] = y; +} + +/* { dg-final { scan-assembler-not "stp\tq\[0-9\]+, q\[0-9\]" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_128_1.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_128_1.c new file mode 100644 index 0000000..7d8d54e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_128_1.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -moverride=tune=none" } */ + + +typedef int int32x4_t __attribute__ ((__vector_size__ ((16)))); + +void +bar (int32x4_t *foo) +{ + int i = 0; + int32x4_t val = { 3, 2, 5, 1 }; + + for (i = 0; i < 256; i+=2) + { + foo[i] = val; + foo[i+1] = val; + } +} + +/* { dg-final { scan-assembler "stp\tq\[0-9\]+, q\[0-9\]" } } */ |