author     Craig Blackmore <craig.blackmore@embecosm.com>   2024-10-18 09:17:21 -0600
committer  Jeff Law <jlaw@ventanamicro.com>                 2024-10-18 09:17:21 -0600
commit     b039d06c9a810a3fab4c5eb9d50b0c7aff94b2d8 (patch)
tree       4467964e4619b45bfb03bcd035273f569b010669 /gcc
parent     212d8685e4590c9f1168f503a383e3ea2639b418 (diff)
[PATCH 3/7] RISC-V: Fix vector memcpy smaller LMUL generation
If riscv_vector::expand_block_move is generating a straight-line memcpy
using a predicated store, it tries to use a smaller LMUL to reduce
register pressure, provided that LMUL still allows the entire transfer.

This selection happens in the inner loop of riscv_vector::expand_block_move;
however, the vmode chosen by that loop gets overwritten later in the
function, so I have added the missing break from the outer loop.
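To make the control flow concrete, here is a minimal standalone sketch of the search, not the GCC code itself: the hard-coded VLEN, the helper `fits_in_one_group`, and the omission of the mode-existence checks are simplifications for illustration. The outer loop walks element widths from widest to narrowest, the inner loop walks LMUL upwards, and the added outer break keeps the first suitable choice from being clobbered by later iterations or by the fallback code after the loop:
```c
#include <stdbool.h>
#include <stdio.h>

/* Assumed VLEN in bits for this sketch.  */
#define MIN_VLEN 128

/* Does the whole copy fit in one register group of this LMUL?
   (Mirrors the fixed condition; the real code also checks that a vector
   mode with the given element width actually exists.)  */
static bool
fits_in_one_group (unsigned length, unsigned lmul)
{
  return length * 8 <= MIN_VLEN * lmul;
}

int
main (void)
{
  unsigned length = 31;               /* bytes to copy */
  unsigned chosen_ew = 0, chosen_lmul = 0;

  /* Outer loop: try the widest element size first, then narrower ones.  */
  for (unsigned potential_ew = 8; potential_ew >= 1; potential_ew >>= 1)
    {
      /* Inner loop: smallest LMUL that still covers the whole transfer.  */
      for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
        if (fits_in_one_group (length, lmul))
          {
            chosen_ew = potential_ew;
            chosen_lmul = lmul;
            break;                    /* inner break: keep this LMUL */
          }

      /* The fix: once a suitable choice exists, stop searching; without
         this break, code that runs afterwards would overwrite it.  */
      if (chosen_lmul != 0)
        break;
    }

  printf ("ew=%u lmul=%u\n", chosen_ew, chosen_lmul);
  return 0;
}
```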
I have also addressed a couple of issues with the conditions of the if
statement within the inner loop.
The first condition did not make sense to me:
```
TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT
```
I think this was supposed to be checking that the length fits within a
register group of the given LMUL, so I have changed it to check exactly that.
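As a worked example of the difference, assume VLEN = 128 and a 31-byte copy with byte elements, so nunits = length = 31 (illustrative numbers, not taken from the patch):
```c
#include <stdio.h>

int
main (void)
{
  const unsigned min_vlen = 128;      /* stand-in for TARGET_MIN_VLEN */
  const unsigned bits_per_unit = 8;   /* stand-in for BITS_PER_UNIT */
  const unsigned length = 31;         /* bytes to copy */
  const unsigned nunits = 31;         /* byte elements */

  for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
    {
      /* Old: "register group size <= data size" -- passes for LMUL=1 even
         though 31 bytes do not fit in a single 16-byte m1 group.  */
      int old_ok = min_vlen * lmul <= nunits * bits_per_unit;
      /* New: "data fits in one register group of this LMUL" -- first
         passes at LMUL=2, the smallest group that covers 31 bytes.  */
      int new_ok = length * bits_per_unit <= min_vlen * lmul;
      printf ("lmul=%u old=%d new=%d\n", lmul, old_ok, new_ok);
    }
  return 0;
}
```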
The second condition:
```
/* Avoid loosing the option of using vsetivli . */
&& (nunits <= 31 * lmul || nunits > 31 * 8)
```
seems to imply that lmul affects the range of the AVL immediate that
vsetivli can take, but I don't think that is correct. In any case, the
condition is unnecessary: once we find a suitable mode we should stick
with it, regardless of whether it allows vsetivli, rather than continue
trying a larger lmul (which would increase register pressure) or a
smaller potential_ew (which would increase the AVL). I have removed this
condition.
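For context on why the removed check is beside the point: vsetivli encodes its AVL as a 5-bit unsigned immediate, so it can express 0-31 elements, and that limit does not change with LMUL. A minimal sketch of the encoding constraint (the helper name is made up):
```c
#include <assert.h>
#include <stdbool.h>

/* vsetivli's AVL field is a 5-bit unsigned immediate (uimm5), so values
   0..31 are representable; LMUL changes how many elements a register
   group holds, not the range of this immediate.  */
static bool
avl_fits_vsetivli (unsigned avl)
{
  return avl <= 31;
}

int
main (void)
{
  assert (avl_fits_vsetivli (31));    /* fits, regardless of LMUL */
  assert (!avl_fits_vsetivli (32));   /* needs li + vsetvli instead */
  return 0;
}
```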
gcc/ChangeLog:

	* config/riscv/riscv-string.cc (expand_block_move): Fix
	condition for using smaller LMUL. Break outer loop if a
	suitable vmode has been found.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/vsetvl/pr112929-1.c: Expect smaller lmul.
	* gcc.target/riscv/rvv/vsetvl/pr112988-1.c: Likewise.
	* gcc.target/riscv/rvv/base/cpymem-3.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/riscv/riscv-string.cc                        |  8
-rw-r--r--  gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-3.c      | 85
-rw-r--r--  gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112929-1.c  |  2
-rw-r--r--  gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112988-1.c  |  2

4 files changed, 92 insertions, 5 deletions
```
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 0f1353b..b590c51 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -1153,9 +1153,7 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
 	 Still, by choosing a lower LMUL factor that still allows
 	 an entire transfer, we can reduce register pressure.  */
       for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
-	if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT
-	    /* Avoid loosing the option of using vsetivli .  */
-	    && (nunits <= 31 * lmul || nunits > 31 * 8)
+	if (length * BITS_PER_UNIT <= TARGET_MIN_VLEN * lmul
 	    && multiple_p (BYTES_PER_RISCV_VECTOR * lmul, potential_ew)
 	    && (riscv_vector::get_vector_mode
 		 (elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * lmul,
@@ -1163,6 +1161,10 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
 	    break;
 	}
 
+      /* Stop searching if a suitable vmode has been found.  */
+      if (vmode != VOIDmode)
+	break;
+
       /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
 	 wide.  BYTES_PER_RISCV_VECTOR can't be evenly divided by the sizes
 	 of larger element types; the LMUL factor of 8 can at
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-3.c b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-3.c
new file mode 100644
index 0000000..f07078b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-3.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1 -fno-schedule-insns -fno-schedule-insns2 -mrvv-max-lmul=m8" } */
+/* { dg-add-options riscv_v } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
+
+/* Check that vector memcpy with predicated store uses smaller LMUL where
+   possible.
+
+/* m1
+** f1:
+** (
+**   vsetivli\s+zero,\d+,e8,m1,ta,ma
+** |
+**   li\s+[ta][0-7],\d+
+**   vsetvli\s+zero,[ta][0-7],e8,m1,ta,ma
+** )
+**   vle8.v\s+v\d+,0\(a1\)
+**   vse8.v\s+v\d+,0\(a0\)
+**   ret
+*/
+
+void f1 (char *d, char *s)
+{
+  __builtin_memcpy (d, s, MIN_VECTOR_BYTES - 1);
+}
+
+/* m2
+** f2:
+** (
+**   vsetivli\s+zero,\d+,e8,m2,ta,ma
+** |
+**   li\s+[ta][0-7],\d+
+**   vsetvli\s+zero,[ta][0-7],e8,m2,ta,ma
+** )
+**   vle8.v\s+v\d+,0\(a1\)
+**   vse8.v\s+v\d+,0\(a0\)
+**   ret
+*/
+
+void f2 (char *d, char *s)
+{
+  __builtin_memcpy (d, s, 2 * MIN_VECTOR_BYTES - 1);
+}
+
+/* m4
+** f3:
+** (
+**   vsetivli\s+zero,\d+,e8,m4,ta,ma
+** |
+**   li\s+[ta][0-7],\d+
+**   vsetvli\s+zero,[ta][0-7],e8,m4,ta,ma
+** )
+**   vle8.v\s+v\d+,0\(a1\)
+**   vse8.v\s+v\d+,0\(a0\)
+**   ret
+*/
+
+void f3 (char *d, char *s)
+{
+  __builtin_memcpy (d, s, 4 * MIN_VECTOR_BYTES - 1);
+}
+
+/* m8
+** f4:
+** (
+**   vsetivli\s+zero,\d+,e8,m8,ta,ma
+** |
+**   li\s+[ta][0-7],\d+
+**   vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
+** |
+**   li\s+[ta][0-7],\d+
+**   addi\s+[ta][0-7],[ta][0-7],-?\d+
+**   vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
+** )
+**   vle8.v\s+v\d+,0\(a1\)
+**   vse8.v\s+v\d+,0\(a0\)
+**   ret
+*/
+
+void f4 (char *d, char *s)
+{
+  __builtin_memcpy (d, s, 8 * MIN_VECTOR_BYTES - 1);
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112929-1.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112929-1.c
index 86d65dd..e55604e 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112929-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112929-1.c
@@ -54,5 +54,5 @@ int main() {
 
 /* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
 /* { dg-final { scan-assembler-not {vsetivli} } } */
-/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m8,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m2,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
 /* { dg-final { scan-assembler-times {li\t[a-x0-9]+,\s*32} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112988-1.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112988-1.c
index 63817f2..b20e463 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112988-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112988-1.c
@@ -64,5 +64,5 @@ int main() {
 
 /* { dg-final { scan-assembler-times {vsetvli} 4 } } */
 /* { dg-final { scan-assembler-not {vsetivli} } } */
-/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m8,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m2,\s*t[au],\s*m[au]} 1 } } */
 /* { dg-final { scan-assembler-times {li\t[a-x0-9]+,\s*32} 1 } } */
```
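For anyone wanting to see the effect outside the testsuite, a small reproducer along the lines of the new cpymem-3.c may help. This is hedged: the cross-compiler name and flags below are just one plausible setup, assuming a VLEN of 128 so that a 31-byte copy fits in an LMUL=2 register group, and the actual expansion still depends on optimization options and tuning.
```c
/* copy31.c -- after this patch the predicated vle8.v/vse8.v expansion of a
   31-byte copy is expected to be set up with e8,m2 rather than e8,m8,
   matching the adjusted scans in pr112929-1.c and pr112988-1.c.

   Possible invocation (assumed cross-compiler name):
     riscv64-unknown-elf-gcc -march=rv64gcv -mabi=lp64d -O1 \
       -mrvv-max-lmul=m8 -S copy31.c  */

void
copy31 (char *d, const char *s)
{
  __builtin_memcpy (d, s, 31);
}
```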