author     Richard Sandiford <richard.sandiford@arm.com>  2021-11-12 17:33:00 +0000
committer  Richard Sandiford <richard.sandiford@arm.com>  2021-11-12 17:33:00 +0000
commit     526e1639aa76b0a8496b0dc3a3ff2c450229544e
tree       4fa1b1080f18d1e86749a97a8dfde1b9617d1bcc
parent     48c6cac9caea1dc7c5f50ad3a736f6693e74a11b
aarch64: Detect more consecutive MEMs
For tests like:

    int res[2];
    void f1 (int x, int y) { res[0] = res[1] = x + y; }

we generated:

    add     w0, w0, w1
    adrp    x1, .LANCHOR0
    add     x2, x1, :lo12:.LANCHOR0
    str     w0, [x1, #:lo12:.LANCHOR0]
    str     w0, [x2, 4]
    ret

Using [x1, #:lo12:.LANCHOR0] for the first store prevented the two stores
being recognised as a pair.  However, the MEM_EXPR and MEM_OFFSET
information tell us that the MEMs really are consecutive.  The peephole2
context then guarantees that the first address is equivalent to [x2, 0].

While there: the reg_mentioned_p tests for loads were probably correct,
but seemed a bit indirect.  We're matching two consecutive loads, so the
thing we need to test is that the second MEM in the original sequence
doesn't depend on the result of the first load in the original sequence.

gcc/
	* config/aarch64/aarch64.c: Include tree-dfa.h.
	(aarch64_check_consecutive_mems): New function that takes MEM_EXPR
	and MEM_OFFSET into account.
	(aarch64_swap_ldrstr_operands): Use it.
	(aarch64_operands_ok_for_ldpstp): Likewise.  Check that the
	address of the second memory doesn't depend on the result of
	the first load.

gcc/testsuite/
	* gcc.target/aarch64/stp_1.c: New test.
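To make the MEM_EXPR/MEM_OFFSET idea concrete, here is a minimal sketch of
the adjacency test that the patch relies on.  It is not the patch code
itself: the helper name mems_provably_adjacent_p is invented for
illustration, and the sketch assumes the same GCC-internal headers that
aarch64.c already includes (plus tree-dfa.h); it only asks whether the
tree-level information proves that the second access starts exactly where
the first one ends, and does not handle the reversed case.

    /* Illustrative sketch only.  Return true if MEM1 and MEM2 are MEM
       rtxes whose MEM_EXPR/MEM_OFFSET information proves that MEM2
       starts exactly where MEM1 ends.  */
    static bool
    mems_provably_adjacent_p (rtx mem1, rtx mem2)
    {
      if (!MEM_EXPR (mem1) || !MEM_EXPR (mem2)
          || !MEM_OFFSET_KNOWN_P (mem1) || !MEM_OFFSET_KNOWN_P (mem2)
          || !MEM_SIZE_KNOWN_P (mem1))
        return false;

      /* Find the common base object (e.g. the VAR_DECL for "res") and
         the byte offset of each MEM_EXPR within it.  */
      poly_int64 off1, off2;
      tree base1 = get_addr_base_and_unit_offset (MEM_EXPR (mem1), &off1);
      tree base2 = get_addr_base_and_unit_offset (MEM_EXPR (mem2), &off2);
      if (!base1 || !base2
          || !operand_equal_p (base1, base2, OEP_ADDRESS_OF))
        return false;

      /* Adjacent if the second access begins where the first one ends.  */
      return known_eq (off1 + MEM_OFFSET (mem1) + MEM_SIZE (mem1),
                       off2 + MEM_OFFSET (mem2));
    }

The real aarch64_check_consecutive_mems in the diff below goes further:
it also accepts the reversed order and, on success, rewrites one of the
addresses so that both MEMs use constant offsets from the same base
register, which is what lets the peephole form an STP/LDP.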
-rw-r--r--  gcc/config/aarch64/aarch64.c              156
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/stp_1.c   29
2 files changed, 133 insertions, 52 deletions
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a8f53b8..850288d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -79,6 +79,7 @@
#include "tree-ssa-loop-niter.h"
#include "fractional-cost.h"
#include "rtlanal.h"
+#include "tree-dfa.h"
/* This file should be included last. */
#include "target-def.h"
@@ -24569,6 +24570,97 @@ aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
return priority;
}
+/* Check if *MEM1 and *MEM2 are consecutive memory references and,
+ if they are, try to make them use constant offsets from the same base
+ register. Return true on success. When returning true, set *REVERSED
+ to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
+static bool
+aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
+{
+ *reversed = false;
+ if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
+ || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
+ return false;
+
+ if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
+ return false;
+
+ auto size1 = MEM_SIZE (*mem1);
+ auto size2 = MEM_SIZE (*mem2);
+
+ rtx base1, base2, offset1, offset2;
+ extract_base_offset_in_addr (*mem1, &base1, &offset1);
+ extract_base_offset_in_addr (*mem2, &base2, &offset2);
+
+ /* Make sure at least one memory is in base+offset form. */
+ if (!(base1 && offset1) && !(base2 && offset2))
+ return false;
+
+ /* If both mems already use the same base register, just check the
+ offsets. */
+ if (base1 && base2 && rtx_equal_p (base1, base2))
+ {
+ if (!offset1 || !offset2)
+ return false;
+
+ if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
+ return true;
+
+ if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)))
+ {
+ *reversed = true;
+ return true;
+ }
+
+ return false;
+ }
+
+ /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
+ guarantee that the values are consecutive. */
+ if (MEM_EXPR (*mem1)
+ && MEM_EXPR (*mem2)
+ && MEM_OFFSET_KNOWN_P (*mem1)
+ && MEM_OFFSET_KNOWN_P (*mem2))
+ {
+ poly_int64 expr_offset1;
+ poly_int64 expr_offset2;
+ tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
+ &expr_offset1);
+ tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
+ &expr_offset2);
+ if (!expr_base1
+ || !expr_base2
+ || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
+ return false;
+
+ expr_offset1 += MEM_OFFSET (*mem1);
+ expr_offset2 += MEM_OFFSET (*mem2);
+
+ if (known_eq (expr_offset1 + size1, expr_offset2))
+ ;
+ else if (known_eq (expr_offset2 + size2, expr_offset1))
+ *reversed = true;
+ else
+ return false;
+
+ if (base2)
+ {
+ rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
+ expr_offset1 - expr_offset2);
+ *mem1 = replace_equiv_address_nv (*mem1, addr1);
+ }
+ else
+ {
+ rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
+ expr_offset2 - expr_offset1);
+ *mem2 = replace_equiv_address_nv (*mem2, addr2);
+ }
+ return true;
+ }
+
+ return false;
+}
+
/* Given OPERANDS of consecutive load/store, check if we can merge
them into ldp/stp. LOAD is true if they are load instructions.
MODE is the mode of memory operands. */
@@ -24577,9 +24669,8 @@ bool
aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
machine_mode mode)
{
- HOST_WIDE_INT offval_1, offval_2, msize;
enum reg_class rclass_1, rclass_2;
- rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
+ rtx mem_1, mem_2, reg_1, reg_2;
if (load)
{
@@ -24590,6 +24681,8 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
gcc_assert (REG_P (reg_1) && REG_P (reg_2));
if (REGNO (reg_1) == REGNO (reg_2))
return false;
+ if (reg_overlap_mentioned_p (reg_1, mem_2))
+ return false;
}
else
{
@@ -24613,40 +24706,13 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
return false;
/* Check if the addresses are in the form of [base+offset]. */
- extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
- if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
- return false;
- extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
- if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
- return false;
-
- /* Check if the bases are same. */
- if (!rtx_equal_p (base_1, base_2))
+ bool reversed = false;
+ if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
return false;
/* The operands must be of the same size. */
gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
- GET_MODE_SIZE (GET_MODE (mem_2))));
-
- offval_1 = INTVAL (offset_1);
- offval_2 = INTVAL (offset_2);
- /* We should only be trying this for fixed-sized modes. There is no
- SVE LDP/STP instruction. */
- msize = GET_MODE_SIZE (mode).to_constant ();
- /* Check if the offsets are consecutive. */
- if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
- return false;
-
- /* Check if the addresses are clobbered by load. */
- if (load)
- {
- if (reg_mentioned_p (reg_1, mem_1))
- return false;
-
- /* In increasing order, the last load can clobber the address. */
- if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
- return false;
- }
+ GET_MODE_SIZE (GET_MODE (mem_2))));
/* One of the memory accesses must be a mempair operand.
If it is not the first one, they need to be swapped by the
@@ -24677,27 +24743,13 @@ aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
void
aarch64_swap_ldrstr_operands (rtx* operands, bool load)
{
- rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
- HOST_WIDE_INT offval_1, offval_2;
-
- if (load)
- {
- mem_1 = operands[1];
- mem_2 = operands[3];
- }
- else
- {
- mem_1 = operands[0];
- mem_2 = operands[2];
- }
-
- extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
- extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
-
- offval_1 = INTVAL (offset_1);
- offval_2 = INTVAL (offset_2);
+ int mem_op = load ? 1 : 0;
+ bool reversed = false;
+ if (!aarch64_check_consecutive_mems (operands + mem_op,
+ operands + mem_op + 2, &reversed))
+ gcc_unreachable ();
- if (offval_1 > offval_2)
+ if (reversed)
{
/* Irrespective of whether this is a load or a store,
we do the same swap. */
diff --git a/gcc/testsuite/gcc.target/aarch64/stp_1.c b/gcc/testsuite/gcc.target/aarch64/stp_1.c
new file mode 100644
index 0000000..c1a5e6b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stp_1.c
@@ -0,0 +1,29 @@
+/* { dg-options "-O -fpeephole2" } */
+
+int res[2];
+
+void
+f1 (int x, int y)
+{
+ res[0] = res[1] = x + y;
+}
+
+void
+f2 (int x, int y, int *res)
+{
+ res[0] = res[1] = x + y;
+}
+
+void
+f3 (int x, int y)
+{
+ res[1] = res[0] = x + y;
+}
+
+void
+f4 (int x, int y, int *res)
+{
+ res[1] = res[0] = x + y;
+}
+
+/* { dg-final { scan-assembler-times {\tstp\t} 4 } } */