diff options
author | Kugan Vivekanandarajah <kvivekananda@nvidia.com> | 2024-10-31 07:23:10 +1100 |
---|---|---|
committer | Kugan Vivekanandarajah <kvivekananda@nvidia.com> | 2024-10-31 07:23:16 +1100 |
commit | acba8b3d8dec0124c8b3a7e112b3a784a5091214 (patch) | |
tree | c61a8c40f9bca86c66fff80e92ddd3dc6ad4a7d4 /gcc | |
parent | 5be5c66071b407a767856b8fa300ede54fcf11b4 (diff) | |
download | gcc-acba8b3d8dec0124c8b3a7e112b3a784a5091214.zip gcc-acba8b3d8dec0124c8b3a7e112b3a784a5091214.tar.gz gcc-acba8b3d8dec0124c8b3a7e112b3a784a5091214.tar.bz2 |
[PATCH] Fix SLP when ifcvt versioned loop is not vectorized
When ifcvt versions a loop, it sets dont_vectorize on the scalar loop. If the
vector loop is not vectorized and is removed, the scalar loop is still left with
dont_vectorize set. As a result, BB vectorization will not happen.
This patch resets dont_vectorize on the scalar loop when IFN_LOOP_VECTORIZED
is set to false.
gcc/ChangeLog:
* tree-vectorizer.cc (pass_vectorize::execute): Reset dont_vectorize
on the scalar loop when setting IFN_LOOP_VECTORIZED to false.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/bb-slp-77.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/bb-slp-77.c | 74 | ||||
-rw-r--r-- | gcc/tree-vectorizer.cc | 2 |
2 files changed, 76 insertions, 0 deletions
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-77.c b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c new file mode 100644 index 0000000..b2cc1d1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c @@ -0,0 +1,74 @@ + +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ +#include <stdint.h> +#include <string.h> + + +typedef struct { + uint16_t d; + uint16_t m; + uint8_t val1[4]; + uint8_t val2[16]; +} st1; + +typedef struct { + float d; + float s; + int8_t val2[32]; +} st2; + +float table[1 << 16]; + +inline static float foo(uint16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return table[s]; +} + + +void test(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + const int nb = n / 32; + + + const st1 * restrict x = vx; + const st2 * restrict y = vy; + + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t val1; + memcpy(&val1, x[i].val1, sizeof(val1)); + + int sumi0 = 0; + int sumi1 = 0; + + if (val1) { + for (int j = 0; j < 16; ++j) { + const uint8_t xh_0 = ((val1 >> (j)) << 4) & 0x10; + const uint8_t xh_1 = ((val1 >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[i].val2[j] & 0xF) | xh_0; + const int32_t x1 = (x[i].val2[j] >> 4) | xh_1; + + sumi0 += (x0 * y[i].val2[j]); + sumi1 += (x1 * y[i].val2[j + 16]); + } + } else { + for (int j = 0; j < 16; ++j) { + const int32_t x0 = (x[i].val2[j] & 0xF); + const int32_t x1 = (x[i].val2[j] >> 4); + + sumi0 += (x0 * y[i].val2[j]); + sumi1 += (x1 * y[i].val2[j + 16]); + } + } + + int sumi = sumi0 + sumi1; + sumf += (foo(x[i].d)*y[i].d)*sumi + foo(x[i].m)*y[i].s; + } + + *s = sumf; +} + +/* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp1" { target { { vect_int_mult && vect_element_align } && { ! 
powerpc*-*-* } } } } } */ diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc index af112f2..16fa0ec 100644 --- a/gcc/tree-vectorizer.cc +++ b/gcc/tree-vectorizer.cc @@ -1326,6 +1326,7 @@ pass_vectorize::execute (function *fun) if (g) { fold_loop_internal_call (g, boolean_false_node); + loop->dont_vectorize = false; ret |= TODO_cleanup_cfg; g = NULL; } @@ -1335,6 +1336,7 @@ pass_vectorize::execute (function *fun) if (g) { fold_loop_internal_call (g, boolean_false_node); + loop->dont_vectorize = false; ret |= TODO_cleanup_cfg; } } |