aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorAndre Vieira <andre.simoesdiasvieira@arm.com>2024-11-29 10:18:57 +0000
committerAndre Vieira <andre.simoesdiasvieira@arm.com>2024-11-29 15:54:44 +0000
commitf42fd8e9335354f986d69b92ab66be07cc31bc7a (patch)
treec4f6abb3d01d95a1f8ed79e3e61dc3be71301b24 /gcc
parent15bd62513acf802966fd04f58f4ada84a6d2a7fd (diff)
downloadgcc-f42fd8e9335354f986d69b92ab66be07cc31bc7a.zip
gcc-f42fd8e9335354f986d69b92ab66be07cc31bc7a.tar.gz
gcc-f42fd8e9335354f986d69b92ab66be07cc31bc7a.tar.bz2
arm, mve: Detect uses of vctp_vpr_generated inside subregs
Address a problem where we failed to detect uses of vctp_vpr_generated in the analysis for 'arm_attempt_dlstp_transform' because the use was inside a SUBREG and rtx_equal_p does not catch that. Using reg_overlap_mentioned_p is much more robust. gcc/ChangeLog: PR target/117814 * config/arm/arm.cc (arm_attempt_dlstp_transform): Use reg_overlap_mentioned_p instead of rtx_equal_p to detect uses of vctp_vpr_generated inside subregs. gcc/testsuite/ChangeLog: PR target/117814 * gcc.target/arm/mve/dlstp-invalid-asm.c (test10): Renamed to... (test10a): ... this. (test10b): Variation of test10a with a small change to trigger wrong codegen.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/arm/arm.cc3
-rw-r--r--gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c37
2 files changed, 37 insertions, 3 deletions
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 7292fdd..7f82fb9 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -35847,7 +35847,8 @@ arm_attempt_dlstp_transform (rtx label)
df_ref insn_uses = NULL;
FOR_EACH_INSN_USE (insn_uses, insn)
{
- if (rtx_equal_p (vctp_vpr_generated, DF_REF_REG (insn_uses)))
+ if (reg_overlap_mentioned_p (vctp_vpr_generated,
+ DF_REF_REG (insn_uses)))
{
end_sequence ();
return 1;
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
index 26df2d3..eb0782e 100644
--- a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
@@ -127,8 +127,15 @@ void test9 (int32_t *a, int32_t *b, int32_t *c, int n)
}
}
-/* Using a VPR that gets re-generated within the loop. */
-void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
+/* Using a VPR that gets re-generated within the loop. Even though we
+ currently reject such loops, it would be possible to dlstp transform this
+ specific loop, as long as we make sure that the first vldrwq_z mask would
+ either:
+ * remain the same as its mask in the first iteration,
+ * become the same as the loop mask after the first iteration,
+ * become all ones, since the dlstp would then mask it the same as the loop
+ mask. */
+void test10a (int32_t *a, int32_t *b, int32_t *c, int n)
{
mve_pred16_t p = vctp32q (n);
while (n > 0)
@@ -145,6 +152,32 @@ void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
}
}
+/* Using a VPR that gets re-generated within the loop. Unlike test10a, the
+ two vctp calls here are never the same, which leads to slightly different
+ codegen in some cases, triggering the issue in a different way. This loop
+ too would be OK to dlstp transform as long
+ as we made sure that the first vldrwq_z mask would either:
+ * remain the same as its mask in the first iteration,
+ * become the same as the loop mask after the first iteration,
+ * become all ones, since the dlstp would then mask it the same as the loop
+ mask. */
+void test10b (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ mve_pred16_t p = vctp32q (n-4);
+ while (n > 0)
+ {
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ p = vctp32q (n);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
/* Using vctp32q_m instead of vctp32q. */
void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0)
{