aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gcc/ChangeLog17
-rw-r--r--gcc/config/i386/sse.md13
-rw-r--r--gcc/doc/md.texi8
-rw-r--r--gcc/optabs-query.c5
-rw-r--r--gcc/optabs.c76
-rw-r--r--gcc/optabs.def1
-rw-r--r--gcc/testsuite/ChangeLog14
-rw-r--r--gcc/testsuite/gcc.dg/vect/vect-simd-10.c96
-rw-r--r--gcc/testsuite/gcc.dg/vect/vect-simd-8.c4
-rw-r--r--gcc/testsuite/gcc.dg/vect/vect-simd-9.c4
-rw-r--r--gcc/testsuite/gcc.target/i386/avx2-vect-simd-10.c16
-rw-r--r--gcc/testsuite/gcc.target/i386/avx2-vect-simd-8.c16
-rw-r--r--gcc/testsuite/gcc.target/i386/avx2-vect-simd-9.c16
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-vect-simd-10.c16
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-vect-simd-8.c16
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-vect-simd-9.c16
-rw-r--r--gcc/testsuite/gcc.target/i386/sse2-vect-simd-10.c15
-rw-r--r--gcc/testsuite/gcc.target/i386/sse2-vect-simd-8.c16
-rw-r--r--gcc/testsuite/gcc.target/i386/sse2-vect-simd-9.c16
-rw-r--r--gcc/tree-vect-generic.c26
-rw-r--r--gcc/tree-vect-stmts.c149
21 files changed, 493 insertions, 63 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 7d337bc..922d1b5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,22 @@
2019-06-19 Jakub Jelinek <jakub@redhat.com>
+ * doc/md.texi: Document vec_shl_<mode> pattern.
+ * optabs.def (vec_shl_optab): New optab.
+ * optabs.c (shift_amt_for_vec_perm_mask): Add shift_optab
+ argument, if == vec_shl_optab, check for left whole vector shift
+ pattern rather than right shift.
+ (expand_vec_perm_const): Add vec_shl_optab support.
+ * optabs-query.c (can_vec_perm_var_p): Mention also vec_shl optab
+ in the comment.
+ * tree-vect-generic.c (lower_vec_perm): Support permutations which
+ can be handled by vec_shl_optab.
+ * tree-vect-stmts.c (scan_store_can_perm_p): New function.
+ (check_scan_store): Use it.
+ (vectorizable_scan_store): If target can't do normal permutations,
+ try to use whole vector left shifts and if needed a VEC_COND_EXPR
+ after it.
+ * config/i386/sse.md (vec_shl_<mode>): New expander.
+
* omp-low.c (lower_rec_input_clauses): Handle references properly
in inscan clauses.
(lower_omp_scan): Likewise.
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5d8ada4..26309ae 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -11758,6 +11758,19 @@
(set_attr "mode" "<sseinsnmode>")])
+(define_expand "vec_shl_<mode>"
+ [(set (match_dup 3)
+ (ashift:V1TI
+ (match_operand:VI_128 1 "register_operand")
+ (match_operand:SI 2 "const_0_to_255_mul_8_operand")))
+ (set (match_operand:VI_128 0 "register_operand") (match_dup 4))]
+ "TARGET_SSE2"
+{
+ operands[1] = gen_lowpart (V1TImode, operands[1]);
+ operands[3] = gen_reg_rtx (V1TImode);
+ operands[4] = gen_lowpart (<MODE>mode, operands[3]);
+})
+
(define_expand "vec_shr_<mode>"
[(set (match_dup 3)
(lshiftrt:V1TI
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 40c2b8b..b45b4be 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5459,6 +5459,14 @@ in operand 2. Store the result in vector output operand 0. Operands
0 and 1 have mode @var{m} and operand 2 has the mode appropriate for
one element of @var{m}.
+@cindex @code{vec_shl_@var{m}} instruction pattern
+@item @samp{vec_shl_@var{m}}
+Whole vector left shift in bits, i.e.@: away from element 0.
+Operand 1 is a vector to be shifted.
+Operand 2 is an integer shift amount in bits.
+Operand 0 is where the resulting shifted vector is stored.
+The output and input vectors should have the same modes.
+
@cindex @code{vec_shr_@var{m}} instruction pattern
@item @samp{vec_shr_@var{m}}
Whole vector right shift in bits, i.e.@: towards element 0.
diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c
index 04c8d08..4116bfe 100644
--- a/gcc/optabs-query.c
+++ b/gcc/optabs-query.c
@@ -415,8 +415,9 @@ can_vec_perm_var_p (machine_mode mode)
permute (if the target supports that).
Note that additional permutations representing whole-vector shifts may
- also be handled via the vec_shr optab, but only where the second input
- vector is entirely constant zeroes; this case is not dealt with here. */
+ also be handled via the vec_shr or vec_shl optab, but only where the
+ second input vector is entirely constant zeroes; this case is not dealt
+ with here. */
bool
can_vec_perm_const_p (machine_mode mode, const vec_perm_indices &sel,
diff --git a/gcc/optabs.c b/gcc/optabs.c
index a0e361b..5a718e7 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -5444,19 +5444,45 @@ vector_compare_rtx (machine_mode cmp_mode, enum tree_code tcode,
}
/* Check if vec_perm mask SEL is a constant equivalent to a shift of
- the first vec_perm operand, assuming the second operand is a constant
- vector of zeros. Return the shift distance in bits if so, or NULL_RTX
- if the vec_perm is not a shift. MODE is the mode of the value being
- shifted. */
+ the first vec_perm operand, assuming the second operand (for left shift
+ first operand) is a constant vector of zeros. Return the shift distance
+ in bits if so, or NULL_RTX if the vec_perm is not a shift. MODE is the
+ mode of the value being shifted. SHIFT_OPTAB is vec_shr_optab for right
+ shift or vec_shl_optab for left shift. */
static rtx
-shift_amt_for_vec_perm_mask (machine_mode mode, const vec_perm_indices &sel)
+shift_amt_for_vec_perm_mask (machine_mode mode, const vec_perm_indices &sel,
+ optab shift_optab)
{
unsigned int bitsize = GET_MODE_UNIT_BITSIZE (mode);
poly_int64 first = sel[0];
if (maybe_ge (sel[0], GET_MODE_NUNITS (mode)))
return NULL_RTX;
- if (!sel.series_p (0, 1, first, 1))
+ if (shift_optab == vec_shl_optab)
+ {
+ unsigned int nelt;
+ if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
+ return NULL_RTX;
+ unsigned firstidx = 0;
+ for (unsigned int i = 0; i < nelt; i++)
+ {
+ if (known_eq (sel[i], nelt))
+ {
+ if (i == 0 || firstidx)
+ return NULL_RTX;
+ firstidx = i;
+ }
+ else if (firstidx
+ ? maybe_ne (sel[i], nelt + i - firstidx)
+ : maybe_ge (sel[i], nelt))
+ return NULL_RTX;
+ }
+
+ if (firstidx == 0)
+ return NULL_RTX;
+ first = firstidx;
+ }
+ else if (!sel.series_p (0, 1, first, 1))
{
unsigned int nelt;
if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
@@ -5544,25 +5570,37 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
target instruction. */
vec_perm_indices indices (sel, 2, GET_MODE_NUNITS (mode));
- /* See if this can be handled with a vec_shr. We only do this if the
- second vector is all zeroes. */
- insn_code shift_code = optab_handler (vec_shr_optab, mode);
- insn_code shift_code_qi = ((qimode != VOIDmode && qimode != mode)
- ? optab_handler (vec_shr_optab, qimode)
- : CODE_FOR_nothing);
-
- if (v1 == CONST0_RTX (GET_MODE (v1))
- && (shift_code != CODE_FOR_nothing
- || shift_code_qi != CODE_FOR_nothing))
+ /* See if this can be handled with a vec_shr or vec_shl. We only do this
+ if the second (for vec_shr) or first (for vec_shl) vector is all
+ zeroes. */
+ insn_code shift_code = CODE_FOR_nothing;
+ insn_code shift_code_qi = CODE_FOR_nothing;
+ optab shift_optab = unknown_optab;
+ rtx v2 = v0;
+ if (v1 == CONST0_RTX (GET_MODE (v1)))
+ shift_optab = vec_shr_optab;
+ else if (v0 == CONST0_RTX (GET_MODE (v0)))
+ {
+ shift_optab = vec_shl_optab;
+ v2 = v1;
+ }
+ if (shift_optab != unknown_optab)
+ {
+ shift_code = optab_handler (shift_optab, mode);
+ shift_code_qi = ((qimode != VOIDmode && qimode != mode)
+ ? optab_handler (shift_optab, qimode)
+ : CODE_FOR_nothing);
+ }
+ if (shift_code != CODE_FOR_nothing || shift_code_qi != CODE_FOR_nothing)
{
- rtx shift_amt = shift_amt_for_vec_perm_mask (mode, indices);
+ rtx shift_amt = shift_amt_for_vec_perm_mask (mode, indices, shift_optab);
if (shift_amt)
{
struct expand_operand ops[3];
if (shift_code != CODE_FOR_nothing)
{
create_output_operand (&ops[0], target, mode);
- create_input_operand (&ops[1], v0, mode);
+ create_input_operand (&ops[1], v2, mode);
create_convert_operand_from_type (&ops[2], shift_amt, sizetype);
if (maybe_expand_insn (shift_code, 3, ops))
return ops[0].value;
@@ -5571,7 +5609,7 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
{
rtx tmp = gen_reg_rtx (qimode);
create_output_operand (&ops[0], tmp, qimode);
- create_input_operand (&ops[1], gen_lowpart (qimode, v0), qimode);
+ create_input_operand (&ops[1], gen_lowpart (qimode, v2), qimode);
create_convert_operand_from_type (&ops[2], shift_amt, sizetype);
if (maybe_expand_insn (shift_code_qi, 3, ops))
return gen_lowpart (mode, ops[0].value);
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 75c8a0ae..feee96f 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -349,6 +349,7 @@ OPTAB_D (vec_packu_float_optab, "vec_packu_float_$a")
OPTAB_D (vec_perm_optab, "vec_perm$a")
OPTAB_D (vec_realign_load_optab, "vec_realign_load_$a")
OPTAB_D (vec_set_optab, "vec_set$a")
+OPTAB_D (vec_shl_optab, "vec_shl_$a")
OPTAB_D (vec_shr_optab, "vec_shr_$a")
OPTAB_D (vec_unpack_sfix_trunc_hi_optab, "vec_unpack_sfix_trunc_hi_$a")
OPTAB_D (vec_unpack_sfix_trunc_lo_optab, "vec_unpack_sfix_trunc_lo_$a")
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index ad8c1ac..63711af 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,5 +1,19 @@
2019-06-19 Jakub Jelinek <jakub@redhat.com>
+ * gcc.dg/vect/vect-simd-8.c: If main is defined, don't include
+ tree-vect.h nor call check_vect.
+ * gcc.dg/vect/vect-simd-9.c: Likewise.
+ * gcc.dg/vect/vect-simd-10.c: New test.
+ * gcc.target/i386/sse2-vect-simd-8.c: New test.
+ * gcc.target/i386/sse2-vect-simd-9.c: New test.
+ * gcc.target/i386/sse2-vect-simd-10.c: New test.
+ * gcc.target/i386/avx2-vect-simd-8.c: New test.
+ * gcc.target/i386/avx2-vect-simd-9.c: New test.
+ * gcc.target/i386/avx2-vect-simd-10.c: New test.
+ * gcc.target/i386/avx512f-vect-simd-8.c: New test.
+ * gcc.target/i386/avx512f-vect-simd-9.c: New test.
+ * gcc.target/i386/avx512f-vect-simd-10.c: New test.
+
* g++.dg/vect/simd-3.cc: New test.
* g++.dg/vect/simd-4.cc: New test.
* g++.dg/vect/simd-5.cc: New test.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-10.c b/gcc/testsuite/gcc.dg/vect/vect-simd-10.c
new file mode 100644
index 0000000..d442d6b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-10.c
@@ -0,0 +1,96 @@
+/* { dg-require-effective-target size32plus } */
+/* { dg-additional-options "-fopenmp-simd" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */
+
+#ifndef main
+#include "tree-vect.h"
+#endif
+
+float r = 1.0f, a[1024], b[1024];
+
+__attribute__((noipa)) void
+foo (float *a, float *b)
+{
+ #pragma omp simd reduction (inscan, *:r)
+ for (int i = 0; i < 1024; i++)
+ {
+ r *= a[i];
+ #pragma omp scan inclusive(r)
+ b[i] = r;
+ }
+}
+
+__attribute__((noipa)) float
+bar (void)
+{
+ float s = -__builtin_inff ();
+ #pragma omp simd reduction (inscan, max:s)
+ for (int i = 0; i < 1024; i++)
+ {
+ s = s > a[i] ? s : a[i];
+ #pragma omp scan inclusive(s)
+ b[i] = s;
+ }
+ return s;
+}
+
+int
+main ()
+{
+ float s = 1.0f;
+#ifndef main
+ check_vect ();
+#endif
+ for (int i = 0; i < 1024; ++i)
+ {
+ if (i < 80)
+ a[i] = (i & 1) ? 0.25f : 0.5f;
+ else if (i < 200)
+ a[i] = (i % 3) == 0 ? 2.0f : (i % 3) == 1 ? 4.0f : 1.0f;
+ else if (i < 280)
+ a[i] = (i & 1) ? 0.25f : 0.5f;
+ else if (i < 380)
+ a[i] = (i % 3) == 0 ? 2.0f : (i % 3) == 1 ? 4.0f : 1.0f;
+ else
+ switch (i % 6)
+ {
+ case 0: a[i] = 0.25f; break;
+ case 1: a[i] = 2.0f; break;
+ case 2: a[i] = -1.0f; break;
+ case 3: a[i] = -4.0f; break;
+ case 4: a[i] = 0.5f; break;
+ case 5: a[i] = 1.0f; break;
+ default: a[i] = 0.0f; break;
+ }
+ b[i] = -19.0f;
+ asm ("" : "+g" (i));
+ }
+ foo (a, b);
+ if (r * 16384.0f != 0.125f)
+ abort ();
+ float m = -175.25f;
+ for (int i = 0; i < 1024; ++i)
+ {
+ s *= a[i];
+ if (b[i] != s)
+ abort ();
+ else
+ {
+ a[i] = m - ((i % 3) == 1 ? 2.0f : (i % 3) == 2 ? 4.0f : 0.0f);
+ b[i] = -231.75f;
+ m += 0.75f;
+ }
+ }
+ if (bar () != 592.0f)
+ abort ();
+ s = -__builtin_inff ();
+ for (int i = 0; i < 1024; ++i)
+ {
+ if (s < a[i])
+ s = a[i];
+ if (b[i] != s)
+ abort ();
+ }
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-8.c b/gcc/testsuite/gcc.dg/vect/vect-simd-8.c
index 123a201..5d10ad9 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-8.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-8.c
@@ -3,7 +3,9 @@
/* { dg-additional-options "-mavx" { target avx_runtime } } */
/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */
+#ifndef main
#include "tree-vect.h"
+#endif
int r, a[1024], b[1024];
@@ -63,7 +65,9 @@ int
main ()
{
int s = 0;
+#ifndef main
check_vect ();
+#endif
for (int i = 0; i < 1024; ++i)
{
a[i] = i;
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-9.c b/gcc/testsuite/gcc.dg/vect/vect-simd-9.c
index ce5ae57..52eb24f 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-9.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-9.c
@@ -3,7 +3,9 @@
/* { dg-additional-options "-mavx" { target avx_runtime } } */
/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */
+#ifndef main
#include "tree-vect.h"
+#endif
int r, a[1024], b[1024];
@@ -65,7 +67,9 @@ int
main ()
{
int s = 0;
+#ifndef main
check_vect ();
+#endif
for (int i = 0; i < 1024; ++i)
{
a[i] = i;
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-simd-10.c b/gcc/testsuite/gcc.target/i386/avx2-vect-simd-10.c
new file mode 100644
index 0000000..d912351
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vect-simd-10.c
@@ -0,0 +1,16 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
+
+#include "avx2-check.h"
+
+#define main() do_main ()
+
+#include "../../gcc.dg/vect/vect-simd-10.c"
+
+static void
+avx2_test (void)
+{
+ do_main ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-simd-8.c b/gcc/testsuite/gcc.target/i386/avx2-vect-simd-8.c
new file mode 100644
index 0000000..8edd4e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vect-simd-8.c
@@ -0,0 +1,16 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
+
+#include "avx2-check.h"
+
+#define main() do_main ()
+
+#include "../../gcc.dg/vect/vect-simd-8.c"
+
+static void
+avx2_test (void)
+{
+ do_main ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-simd-9.c b/gcc/testsuite/gcc.target/i386/avx2-vect-simd-9.c
new file mode 100644
index 0000000..ba1a3e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vect-simd-9.c
@@ -0,0 +1,16 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
+
+#include "avx2-check.h"
+
+#define main() do_main ()
+
+#include "../../gcc.dg/vect/vect-simd-9.c"
+
+static void
+avx2_test (void)
+{
+ do_main ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-10.c b/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-10.c
new file mode 100644
index 0000000..c0d7cdf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-10.c
@@ -0,0 +1,16 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
+
+#include "avx512f-check.h"
+
+#define main() do_main ()
+
+#include "../../gcc.dg/vect/vect-simd-10.c"
+
+static void
+avx512f_test (void)
+{
+ do_main ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-8.c b/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-8.c
new file mode 100644
index 0000000..f469a13
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-8.c
@@ -0,0 +1,16 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
+
+#include "avx512f-check.h"
+
+#define main() do_main ()
+
+#include "../../gcc.dg/vect/vect-simd-8.c"
+
+static void
+avx512f_test (void)
+{
+ do_main ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-9.c b/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-9.c
new file mode 100644
index 0000000..1e8f5e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-9.c
@@ -0,0 +1,16 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
+
+#include "avx512f-check.h"
+
+#define main() do_main ()
+
+#include "../../gcc.dg/vect/vect-simd-9.c"
+
+static void
+avx512f_test (void)
+{
+ do_main ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse2-vect-simd-10.c b/gcc/testsuite/gcc.target/i386/sse2-vect-simd-10.c
new file mode 100644
index 0000000..3cc182a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-vect-simd-10.c
@@ -0,0 +1,15 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */
+/* { dg-require-effective-target sse2 } */
+
+#include "sse2-check.h"
+
+#define main() do_main ()
+
+#include "../../gcc.dg/vect/vect-simd-10.c"
+
+static void
+sse2_test (void)
+{
+ do_main ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse2-vect-simd-8.c b/gcc/testsuite/gcc.target/i386/sse2-vect-simd-8.c
new file mode 100644
index 0000000..7c7aad8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-vect-simd-8.c
@@ -0,0 +1,16 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
+
+#include "sse2-check.h"
+
+#define main() do_main ()
+
+#include "../../gcc.dg/vect/vect-simd-8.c"
+
+static void
+sse2_test (void)
+{
+ do_main ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse2-vect-simd-9.c b/gcc/testsuite/gcc.target/i386/sse2-vect-simd-9.c
new file mode 100644
index 0000000..0fdff41
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-vect-simd-9.c
@@ -0,0 +1,16 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
+
+#include "sse2-check.h"
+
+#define main() do_main ()
+
+#include "../../gcc.dg/vect/vect-simd-9.c"
+
+static void
+sse2_test (void)
+{
+ do_main ();
+}
diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c
index e9f5505..4a53fc4 100644
--- a/gcc/tree-vect-generic.c
+++ b/gcc/tree-vect-generic.c
@@ -1367,6 +1367,32 @@ lower_vec_perm (gimple_stmt_iterator *gsi)
return;
}
}
+ /* And similarly vec_shl pattern. */
+ if (optab_handler (vec_shl_optab, TYPE_MODE (vect_type))
+ != CODE_FOR_nothing
+ && TREE_CODE (vec0) == VECTOR_CST
+ && initializer_zerop (vec0))
+ {
+ unsigned int first = 0;
+ for (i = 0; i < elements; ++i)
+ if (known_eq (poly_uint64 (indices[i]), elements))
+ {
+ if (i == 0 || first)
+ break;
+ first = i;
+ }
+ else if (first
+ ? maybe_ne (poly_uint64 (indices[i]),
+ elements + i - first)
+ : maybe_ge (poly_uint64 (indices[i]), elements))
+ break;
+ if (i == elements)
+ {
+ gimple_assign_set_rhs3 (stmt, mask);
+ update_stmt (stmt);
+ return;
+ }
+ }
}
else if (can_vec_perm_var_p (TYPE_MODE (vect_type)))
return;
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 790b49b..98a5f3e 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -6356,6 +6356,71 @@ scan_operand_equal_p (tree ref1, tree ref2)
/* Function check_scan_store.
+ Verify if we can perform the needed permutations or whole vector shifts.
+ Return -1 on failure, otherwise exact log2 of vectype's nunits. */
+
+static int
+scan_store_can_perm_p (tree vectype, tree init, int *use_whole_vector_p = NULL)
+{
+ enum machine_mode vec_mode = TYPE_MODE (vectype);
+ unsigned HOST_WIDE_INT nunits;
+ if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
+ return -1;
+ int units_log2 = exact_log2 (nunits);
+ if (units_log2 <= 0)
+ return -1;
+
+ int i;
+ for (i = 0; i <= units_log2; ++i)
+ {
+ unsigned HOST_WIDE_INT j, k;
+ vec_perm_builder sel (nunits, nunits, 1);
+ sel.quick_grow (nunits);
+ if (i == 0)
+ {
+ for (j = 0; j < nunits; ++j)
+ sel[j] = nunits - 1;
+ }
+ else
+ {
+ for (j = 0; j < (HOST_WIDE_INT_1U << (i - 1)); ++j)
+ sel[j] = j;
+ for (k = 0; j < nunits; ++j, ++k)
+ sel[j] = nunits + k;
+ }
+ vec_perm_indices indices (sel, i == 0 ? 1 : 2, nunits);
+ if (!can_vec_perm_const_p (vec_mode, indices))
+ break;
+ }
+
+ if (i == 0)
+ return -1;
+
+ if (i <= units_log2)
+ {
+ if (optab_handler (vec_shl_optab, vec_mode) == CODE_FOR_nothing)
+ return -1;
+ int kind = 1;
+ /* Whole vector shifts shift in zeros, so if init is all zero constant,
+ there is no need to do anything further. */
+ if ((TREE_CODE (init) != INTEGER_CST
+ && TREE_CODE (init) != REAL_CST)
+ || !initializer_zerop (init))
+ {
+ tree masktype = build_same_sized_truth_vector_type (vectype);
+ if (!expand_vec_cond_expr_p (vectype, masktype, VECTOR_CST))
+ return -1;
+ kind = 2;
+ }
+ if (use_whole_vector_p)
+ *use_whole_vector_p = kind;
+ }
+ return units_log2;
+}
+
+
+/* Function check_scan_store.
+
Check magic stores for #pragma omp scan {in,ex}clusive reductions. */
static bool
@@ -6596,34 +6661,9 @@ check_scan_store (stmt_vec_info stmt_info, tree vectype,
if (!optab || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
goto fail;
- unsigned HOST_WIDE_INT nunits;
- if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
+ int units_log2 = scan_store_can_perm_p (vectype, *init);
+ if (units_log2 == -1)
goto fail;
- int units_log2 = exact_log2 (nunits);
- if (units_log2 <= 0)
- goto fail;
-
- for (int i = 0; i <= units_log2; ++i)
- {
- unsigned HOST_WIDE_INT j, k;
- vec_perm_builder sel (nunits, nunits, 1);
- sel.quick_grow (nunits);
- if (i == units_log2)
- {
- for (j = 0; j < nunits; ++j)
- sel[j] = nunits - 1;
- }
- else
- {
- for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
- sel[j] = nunits + j;
- for (k = 0; j < nunits; ++j, ++k)
- sel[j] = k;
- }
- vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits);
- if (!can_vec_perm_const_p (vec_mode, indices))
- goto fail;
- }
return true;
}
@@ -6686,7 +6726,8 @@ vectorizable_scan_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
unsigned HOST_WIDE_INT nunits;
if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
gcc_unreachable ();
- int units_log2 = exact_log2 (nunits);
+ int use_whole_vector_p = 0;
+ int units_log2 = scan_store_can_perm_p (vectype, *init, &use_whole_vector_p);
gcc_assert (units_log2 > 0);
auto_vec<tree, 16> perms;
perms.quick_grow (units_log2 + 1);
@@ -6696,21 +6737,25 @@ vectorizable_scan_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
vec_perm_builder sel (nunits, nunits, 1);
sel.quick_grow (nunits);
if (i == units_log2)
- {
- for (j = 0; j < nunits; ++j)
- sel[j] = nunits - 1;
- }
- else
- {
- for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
- sel[j] = nunits + j;
- for (k = 0; j < nunits; ++j, ++k)
- sel[j] = k;
- }
+ for (j = 0; j < nunits; ++j)
+ sel[j] = nunits - 1;
+ else
+ {
+ for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
+ sel[j] = j;
+ for (k = 0; j < nunits; ++j, ++k)
+ sel[j] = nunits + k;
+ }
vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits);
- perms[i] = vect_gen_perm_mask_checked (vectype, indices);
+ if (use_whole_vector_p && i < units_log2)
+ perms[i] = vect_gen_perm_mask_any (vectype, indices);
+ else
+ perms[i] = vect_gen_perm_mask_checked (vectype, indices);
}
+ tree zero_vec = use_whole_vector_p ? build_zero_cst (vectype) : NULL_TREE;
+ tree masktype = (use_whole_vector_p == 2
+ ? build_same_sized_truth_vector_type (vectype) : NULL_TREE);
stmt_vec_info prev_stmt_info = NULL;
tree vec_oprnd1 = NULL_TREE;
tree vec_oprnd2 = NULL_TREE;
@@ -6742,8 +6787,9 @@ vectorizable_scan_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
for (int i = 0; i < units_log2; ++i)
{
tree new_temp = make_ssa_name (vectype);
- gimple *g = gimple_build_assign (new_temp, VEC_PERM_EXPR, v,
- vec_oprnd1, perms[i]);
+ gimple *g = gimple_build_assign (new_temp, VEC_PERM_EXPR,
+ zero_vec ? zero_vec : vec_oprnd1, v,
+ perms[i]);
new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
if (prev_stmt_info == NULL)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
@@ -6751,6 +6797,25 @@ vectorizable_scan_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
prev_stmt_info = new_stmt_info;
+ if (use_whole_vector_p == 2)
+ {
+ /* Whole vector shift shifted in zero bits, but if *init
+ is not initializer_zerop, we need to replace those elements
+ with elements from vec_oprnd1. */
+ tree_vector_builder vb (masktype, nunits, 1);
+ for (unsigned HOST_WIDE_INT k = 0; k < nunits; ++k)
+ vb.quick_push (k < (HOST_WIDE_INT_1U << i)
+ ? boolean_false_node : boolean_true_node);
+
+ tree new_temp2 = make_ssa_name (vectype);
+ g = gimple_build_assign (new_temp2, VEC_COND_EXPR, vb.build (),
+ new_temp, vec_oprnd1);
+ new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
+ prev_stmt_info = new_stmt_info;
+ new_temp = new_temp2;
+ }
+
tree new_temp2 = make_ssa_name (vectype);
g = gimple_build_assign (new_temp2, code, v, new_temp);
new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);