author     Richard Sandiford <richard.sandiford@arm.com>   2019-10-07 08:36:06 +0000
committer  Richard Sandiford <rsandifo@gcc.gnu.org>        2019-10-07 08:36:06 +0000
commit     698114484e668abf28c70d989bc93fa399dda9ac
tree       fd1f0dfc8c900a11da102a8bbfba388877795b9c
parent     806bdf4e40d31cf55744c876eb9f17654de36b99
[i386] Make the vzeroupper pattern describe its effects (PR91994)
The problem in this PR was that vzeroupper has an effect on register
contents, but those effects weren't modelled in the rtl pattern, which
was just an unspec_volatile.  This patch fixes that by running a subpass
after vzeroupper insertion to add SETs and CLOBBERs as appropriate.
See the comments in the patch for more details.

2019-10-07  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	PR target/91994
	* config/i386/sse.md (avx_vzeroupper): Turn into a define_expand
	and wrap the unspec_volatile in a parallel.
	(*avx_vzeroupper): New define_insn.  Use a match_parallel around
	the unspec_volatile.
	* config/i386/predicates.md (vzeroupper_pattern): Expect the
	unspec_volatile to be wrapped in a parallel.
	* config/i386/i386-features.c (ix86_add_reg_usage_to_vzeroupper)
	(ix86_add_reg_usage_to_vzerouppers): New functions.
	(rest_of_handle_insert_vzeroupper): Use them to add register
	usage information to the vzeroupper instructions.

gcc/testsuite/
	PR target/91994
	* gcc.target/i386/pr91994.c: New test.

From-SVN: r276648
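To illustrate the shape of the change (a schematic sketch derived from
the patterns below, not actual RTL dump output, and with hard register
numbers omitted): the new define_expand initially emits the bare pattern

  (parallel [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])

and the subpass then rewrites it in place.  On a 64-bit target where,
say, only xmm0 is live across the instruction, the annotated insn would
look roughly like

  (parallel [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)
             (set (reg:V2DI xmm0) (reg:V2DI xmm0))
             (clobber (reg:V2DI xmm1))
             ...
             (clobber (reg:V2DI xmm15))])

with one SET per live SSE register and one CLOBBER per dead one.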
-rw-r--r--  gcc/ChangeLog                            | 14
-rw-r--r--  gcc/config/i386/i386-features.c          | 63
-rw-r--r--  gcc/config/i386/predicates.md            |  5
-rw-r--r--  gcc/config/i386/sse.md                   | 13
-rw-r--r--  gcc/testsuite/ChangeLog                  |  5
-rw-r--r--  gcc/testsuite/gcc.target/i386/pr91994.c  | 35
6 files changed, 130 insertions(+), 5 deletions(-)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index ef1eb59..ef8731f 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,17 @@
+2019-10-07 Richard Sandiford <richard.sandiford@arm.com>
+
+ PR target/91994
+ * config/i386/sse.md (avx_vzeroupper): Turn into a define_expand
+ and wrap the unspec_volatile in a parallel.
+ (*avx_vzeroupper): New define_insn. Use a match_parallel around
+ the unspec_volatile.
+ * config/i386/predicates.md (vzeroupper_pattern): Expect the
+ unspec_volatile to be wrapped in a parallel.
+ * config/i386/i386-features.c (ix86_add_reg_usage_to_vzeroupper)
+ (ix86_add_reg_usage_to_vzerouppers): New functions.
+ (rest_of_handle_insert_vzeroupper): Use them to add register
+ usage information to the vzeroupper instructions.
+
2019-10-07 Richard Biener <rguenther@suse.de>

PR tree-optimization/91975
diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index 9b297ba..4781a33 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -1757,6 +1757,68 @@ convert_scalars_to_vector (bool timode_p)
return 0;
}

+/* Modify the vzeroupper pattern in INSN so that it describes the effect
+ that the instruction has on the SSE registers. LIVE_REGS are the set
+ of registers that are live across the instruction.
+
+ For a live register R we use:
+
+ (set (reg:V2DF R) (reg:V2DF R))
+
+ which preserves the low 128 bits but clobbers the upper bits.
+ For a dead register we just use:
+
+ (clobber (reg:V2DF R))
+
+ which invalidates any previous contents of R and stops R from becoming
+ live across the vzeroupper in future. */
+
+static void
+ix86_add_reg_usage_to_vzeroupper (rtx_insn *insn, bitmap live_regs)
+{
+ rtx pattern = PATTERN (insn);
+ unsigned int nregs = TARGET_64BIT ? 16 : 8;
+ rtvec vec = rtvec_alloc (nregs + 1);
+ RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0);
+ for (unsigned int i = 0; i < nregs; ++i)
+ {
+ unsigned int regno = GET_SSE_REGNO (i);
+ rtx reg = gen_rtx_REG (V2DImode, regno);
+ if (bitmap_bit_p (live_regs, regno))
+ RTVEC_ELT (vec, i + 1) = gen_rtx_SET (reg, reg);
+ else
+ RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg);
+ }
+ XVEC (pattern, 0) = vec;
+ df_insn_rescan (insn);
+}
+
+/* Walk the vzeroupper instructions in the function and annotate them
+ with the effect that they have on the SSE registers. */
+
+static void
+ix86_add_reg_usage_to_vzerouppers (void)
+{
+ basic_block bb;
+ rtx_insn *insn;
+ auto_bitmap live_regs;
+
+ df_analyze ();
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ bitmap_copy (live_regs, df_get_live_out (bb));
+ df_simulate_initialize_backwards (bb, live_regs);
+ FOR_BB_INSNS_REVERSE (bb, insn)
+ {
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+ if (vzeroupper_pattern (PATTERN (insn), VOIDmode))
+ ix86_add_reg_usage_to_vzeroupper (insn, live_regs);
+ df_simulate_one_insn_backwards (bb, insn, live_regs);
+ }
+ }
+}
+
static unsigned int
rest_of_handle_insert_vzeroupper (void)
{
@@ -1773,6 +1835,7 @@ rest_of_handle_insert_vzeroupper (void)
/* Call optimize_mode_switching. */
g->get_passes ()->execute_pass_mode_switching ();
+ ix86_add_reg_usage_to_vzerouppers ();
return 0;
}
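A gloss on the SET/CLOBBER encoding above (schematic, and note that the
comment writes the registers in V2DF while the code builds them in
V2DImode; both are 128-bit vector modes, so the choice is immaterial):
under AVX the SSE hard registers are 256 bits wide, and a SET in a
128-bit mode promises to preserve only the low half.

  (set (reg:V2DI xmm2) (reg:V2DI xmm2))  ;; xmm2 live: low 128 bits kept,
                                         ;; upper half of ymm2 undefined
  (clobber (reg:V2DI xmm3))              ;; xmm3 dead: nothing preserved,
                                         ;; and xmm3 cannot later be made
                                         ;; live across this insn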
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 72f8e7e..31f1cea 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1441,8 +1441,9 @@

;; Return true if OP is a vzeroupper pattern.
(define_predicate "vzeroupper_pattern"
- (and (match_code "unspec_volatile")
- (match_test "XINT (op, 1) == UNSPECV_VZEROUPPER")))
+ (and (match_code "parallel")
+ (match_code "unspec_volatile" "a")
+ (match_test "XINT (XVECEXP (op, 0, 0), 1) == UNSPECV_VZEROUPPER")))

;; Return true if OP is an addsub vec_merge operation
(define_predicate "addsub_vm_operator"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c7f539f..07922a1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -19622,9 +19622,16 @@
(set_attr "mode" "OI")])
;; Clear the upper 128bits of AVX registers, equivalent to a NOP
-;; if the upper 128bits are unused.
-(define_insn "avx_vzeroupper"
- [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)]
+;; if the upper 128bits are unused. Initially we expand the instructions
+;; as though they had no effect on the SSE registers, but later add SETs and
+;; CLOBBERs to the PARALLEL to model the real effect.
+(define_expand "avx_vzeroupper"
+ [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
+ "TARGET_AVX")
+
+(define_insn "*avx_vzeroupper"
+ [(match_parallel 0 "vzeroupper_pattern"
+ [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
"TARGET_AVX"
"vzeroupper"
[(set_attr "type" "sse")
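The expand/insn split is what lets the subpass in i386-features.c mutate
the pattern after the fact: a define_insn with a fixed one-element
PARALLEL would stop matching once SETs and CLOBBERs were added, whereas
match_parallel only requires vzeroupper_pattern to hold on the whole
PARALLEL.  Emitting the instruction from C is unchanged; a sketch of the
existing call, as issued by the mode-switching code in the backend:

  /* Generates the one-element PARALLEL via the define_expand above;
     ix86_add_reg_usage_to_vzeroupper later grows it in place.  */
  emit_insn (gen_avx_vzeroupper ());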
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index e8e0060..01253be 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2019-10-07 Richard Sandiford <richard.sandiford@arm.com>
+
+ PR target/91994
+ * gcc.target/i386/pr91994.c: New test.
+
2019-10-07 Richard Biener <rguenther@suse.de>

PR tree-optimization/91975
diff --git a/gcc/testsuite/gcc.target/i386/pr91994.c b/gcc/testsuite/gcc.target/i386/pr91994.c
new file mode 100644
index 0000000..033be68
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr91994.c
@@ -0,0 +1,35 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O2 -mavx -mvzeroupper" } */
+
+#include "avx-check.h"
+
+#include <immintrin.h>
+
+__m256i x1, x2, x3;
+
+__attribute__ ((noinline))
+static void
+foo (void)
+{
+ x1 = x2;
+}
+
+void
+bar (void)
+{
+ __m256i x = x1;
+ foo ();
+ x3 = x;
+}
+
+__attribute__ ((noinline))
+void
+avx_test (void)
+{
+ __m256i x = _mm256_set1_epi8 (3);
+ x1 = x;
+ bar ();
+ if (__builtin_memcmp (&x3, &x, sizeof (x)))
+ __builtin_abort ();
+}