diff options
author | Tamar Christina <tamar.christina@arm.com> | 2024-07-05 12:10:39 +0100 |
---|---|---|
committer | Tamar Christina <tamar.christina@arm.com> | 2024-07-05 12:10:39 +0100 |
commit | 97fcfeac3dcc433b792711fd840b92fa3e860733 (patch) | |
tree | 048d7dd2a3b8f4aa30282ce6e07ccdb3a1e26df0 | |
parent | 6ff698106644af39da9e0eda51974fdcd111280d (diff) | |
download | gcc-97fcfeac3dcc433b792711fd840b92fa3e860733.zip gcc-97fcfeac3dcc433b792711fd840b92fa3e860733.tar.gz gcc-97fcfeac3dcc433b792711fd840b92fa3e860733.tar.bz2 |
AArch64: lower 2 reg TBL permutes with one zero register to 1 reg TBL.
When a two reg TBL is performed with one operand being a zero vector we can
instead use a single reg TBL and map the indices for accessing the zero vector
to an out of range constant.
On AArch64 out of range indices into a TBL have a defined semantics of setting
the element to zero. Many uArches have a slower 2-reg TBL than 1-reg TBL.
Before this change we had:
typedef unsigned int v4si __attribute__ ((vector_size (16)));
v4si f1 (v4si a)
{
v4si zeros = {0,0,0,0};
return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
}
which generates:
f1:
mov v30.16b, v0.16b
movi v31.4s, 0
adrp x0, .LC0
ldr q0, [x0, #:lo12:.LC0]
tbl v0.16b, {v30.16b - v31.16b}, v0.16b
ret
.LC0:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 20
.byte 21
.byte 22
.byte 23
.byte 4
.byte 5
.byte 6
.byte 7
.byte 24
.byte 25
.byte 26
.byte 27
and with the patch:
f1:
adrp x0, .LC0
ldr q31, [x0, #:lo12:.LC0]
tbl v0.16b, {v0.16b}, v31.16b
ret
.LC0:
.byte 0
.byte 1
.byte 2
.byte 3
.byte -1
.byte -1
.byte -1
.byte -1
.byte 4
.byte 5
.byte 6
.byte 7
.byte -1
.byte -1
.byte -1
.byte -1
This sequence is often generated by OpenMP, and aside from the
strict performance impact of this change, it also gives better
register allocation as we no longer have the consecutive
register limitation.
gcc/ChangeLog:
* config/aarch64/aarch64.cc (struct expand_vec_perm_d): Add zero_op0_p
and zero_op1_p.
(aarch64_evpc_tbl): Implement register value remapping.
(aarch64_vectorize_vec_perm_const): Detect if operand is a zero dup
before it's forced to a reg.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/tbl_with_zero_1.c: New test.
* gcc.target/aarch64/tbl_with_zero_2.c: New test.
-rw-r--r-- | gcc/config/aarch64/aarch64.cc | 40 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c | 40 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c | 20 |
3 files changed, 94 insertions, 6 deletions
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 469eb93..7f0cc47 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -25413,6 +25413,7 @@ struct expand_vec_perm_d unsigned int vec_flags; unsigned int op_vec_flags; bool one_vector_p; + bool zero_op0_p, zero_op1_p; bool testing_p; }; @@ -25909,13 +25910,38 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d) /* to_constant is safe since this routine is specific to Advanced SIMD vectors. */ unsigned int nelt = d->perm.length ().to_constant (); + + /* If one register is the constant vector of 0 then we only need + a one reg TBL and we map any accesses to the vector of 0 to -1. We can't + do this earlier since vec_perm_indices clamps elements to within range so + we can only do it during codegen. */ + if (d->zero_op0_p) + d->op0 = d->op1; + else if (d->zero_op1_p) + d->op1 = d->op0; + for (unsigned int i = 0; i < nelt; ++i) - /* If big-endian and two vectors we end up with a weird mixed-endian - mode on NEON. Reverse the index within each word but not the word - itself. to_constant is safe because we checked is_constant above. */ - rperm[i] = GEN_INT (BYTES_BIG_ENDIAN - ? d->perm[i].to_constant () ^ (nelt - 1) - : d->perm[i].to_constant ()); + { + auto val = d->perm[i].to_constant (); + + /* If we're selecting from a 0 vector, we can just use an out of range + index instead. */ + if ((d->zero_op0_p && val < nelt) || (d->zero_op1_p && val >= nelt)) + rperm[i] = constm1_rtx; + else + { + /* If we are remapping a zero register as the first parameter we need + to adjust the indices of the non-zero register. */ + if (d->zero_op0_p) + val = val % nelt; + + /* If big-endian and two vectors we end up with a weird mixed-endian + mode on NEON. Reverse the index within each word but not the word + itself. to_constant is safe because we checked is_constant + above. */ + rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? 
val ^ (nelt - 1) : val); + } + } sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); sel = force_reg (vmode, sel); @@ -26179,6 +26205,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, else d.one_vector_p = false; + d.zero_op0_p = op0 == CONST0_RTX (op_mode); + d.zero_op1_p = op1 == CONST0_RTX (op_mode); d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2, sel.nelts_per_input ()); d.vmode = vmode; diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c new file mode 100644 index 0000000..5595127 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O1" } */ + +typedef unsigned int v4si __attribute__ ((vector_size (16))); + +v4si f1 (v4si a) +{ + v4si zeros = {0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6); +} + +typedef unsigned short v8hi __attribute__ ((vector_size (16))); + +v8hi f2a (v8hi a) +{ + v8hi zeros = {0,0,0,0,0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 9, 1, 10, 2, 11, 3, 12); +} + +v8hi f2b (v8hi a) +{ + v8hi zeros = {0,0,0,0,0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8); +} + +typedef unsigned char v16qi __attribute__ ((vector_size (16))); + +v16qi f3a (v16qi a) +{ + v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24); +} + +v16qi f3b (v16qi a) +{ + v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12); +} + +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 5 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c new file mode 100644 index 0000000..e7d5a67 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target le } */ +/* { dg-additional-options "-O1" } */ + +typedef unsigned int v4si __attribute__ ((vector_size (16))); + +v4si f1 (v4si a) +{ + v4si zeros = {0,0,0,0}; + return __builtin_shufflevector (zeros, a, 0, 5, 1, 6); +} + +v4si f2 (v4si a) +{ + v4si zeros = {0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6); +} + +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 2 } } */ +/* { dg-final { scan-assembler-times {(\.byte\s+-1\n\s+){4}(\.byte\s+[4-7]+\n\s+){4}(\.byte\s+-1\n\s+){4}(\.byte\s+(8|9|10|11)+\n?\s*){4}} 1 } } */ |