diff options
author | Cui, Lili <lili.cui@intel.com> | 2024-11-26 15:10:23 +0800 |
---|---|---|
committer | Cui, Lili <lili.cui@intel.com> | 2024-11-26 15:14:43 +0800 |
commit | 60b708a9c878aff9a76ec0d446ae63e6527327a6 (patch) | |
tree | 245336e7de82c341c760062f24c0d52e5b19729b /gcc | |
parent | efb1d2e2368e60da3c691ee3cb510ee690d1fa2a (diff) | |
download | gcc-60b708a9c878aff9a76ec0d446ae63e6527327a6.zip gcc-60b708a9c878aff9a76ec0d446ae63e6527327a6.tar.gz gcc-60b708a9c878aff9a76ec0d446ae63e6527327a6.tar.bz2 |
Optimize 128-bit vector permutation with pand, pandn and por.
This patch introduces a new subroutine in ix86_expand_vec_perm_const_1.
On x86, use mixed constant permutation for V8HImode and V16QImode when
SSE2 is supported. This patch handles certain vector shuffle operations
more efficiently using pand, pandn, and por. This change is intended to
improve assembly code generation for configurations that support SSE2.
gcc/ChangeLog:
PR target/116675
* config/i386/i386-expand.cc (expand_vec_perm_pand_pandn_por):
New subroutine.
(ix86_expand_vec_perm_const_1): Call expand_vec_perm_pand_pandn_por.
gcc/testsuite/ChangeLog:
PR target/116675
* gcc.target/i386/pr116675.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/i386/i386-expand.cc | 50 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr116675.c | 75 |
2 files changed, 125 insertions, 0 deletions
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index f8dcce4..2eb6197 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23102,6 +23102,53 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement a
+   permutation (which is a blend) with and, andnot and or when pshufb is not available.
+
+   It handles cases like:
+   __builtin_shufflevector (v1, v2, 0, 9, 2, 11, 4, 13, 6, 15);
+   __builtin_shufflevector (v1, v2, 8, 1, 2, 11, 4, 13, 6, 15);
+
+   An element[i] must be chosen between op0[i] and op1[i], and element[0]
+   must be op0[0] (NOTE(review): the second example starts with 8; confirm).
+ */
+
+static bool
+expand_vec_perm_pand_pandn_por (struct expand_vec_perm_d *d)
+{
+  rtx rperm[16], vperm;
+  unsigned int i, nelt = d->nelt;
+
+  if (!TARGET_SSE2
+      || d->one_operand_p
+      || (d->vmode != V16QImode && d->vmode != V8HImode))
+    return false;
+
+  if (d->perm[0] != 0)
+    return false;
+
+  /* The dest[i] must select an element between op0[i] and op1[i]. */
+  for (i = 1; i < nelt; i++)
+    if ((d->perm[i] % nelt) != i)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  /* Generates a blend mask for the operators AND and ANDNOT. */
+  machine_mode inner_mode = GET_MODE_INNER (d->vmode);
+  for (i = 0; i < nelt; i++)
+    rperm[i] = (d->perm[i] < nelt) ? CONSTM1_RTX (inner_mode)
+      : CONST0_RTX (inner_mode);
+
+  vperm = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (nelt, rperm));
+  vperm = force_reg (d->vmode, vperm);
+
+  ix86_expand_sse_movcc (d->target, vperm, d->op0, d->op1);
+
+  return true;
+}
+
 /* Implement permutation with pslldq + psrldq + por when pshufb is not
    available. */
 static bool
@@ -24161,6 +24208,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_psrlw_psllw_por (d))
     return true;
 
+  if (expand_vec_perm_pand_pandn_por (d))
+    return true;
+
   /* Try sequences of four instructions. */
 
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/testsuite/gcc.target/i386/pr116675.c b/gcc/testsuite/gcc.target/i386/pr116675.c
new file mode 100644
index 0000000..e463dd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116675.c
@@ -0,0 +1,75 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2 -mno-ssse3" } */
+/* { dg-final { scan-assembler-times "pand" 4 } } */
+/* { dg-final { scan-assembler-times "pandn" 4 } } */
+/* { dg-final { scan-assembler-times "por" 4 } } */
+
+#include <emmintrin.h>
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v8hi foo1 (__v8hi a, __v8hi b)
+{
+  return __builtin_shufflevector (a, b, 0, 9, 2, 11, 4, 13, 6, 15);
+}
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v8hi foo2 (__v8hi a, __v8hi b)
+{
+  return __builtin_shufflevector (a, b, 8, 9, 2, 3, 4, 13, 14, 15);
+}
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v16qi foo3 (__v16qi a, __v16qi b)
+{
+  return __builtin_shufflevector (a, b, 0, 17, 2, 19, 4, 21, 6, 23,
+                                  8, 25, 10, 27, 12, 29, 14, 31);
+}
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v16qi foo4 (__v16qi a, __v16qi b)
+{
+  return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 21, 6, 23,
+                                  8, 25, 10, 27,12,29,14,31);
+}
+
+__attribute__((noinline, noclone)) void
+compare_v8hi (__v8hi a, __v8hi b)
+{
+  for (int i = 0; i < 8; i++)
+    if (a[i] != b[i])
+      __builtin_abort ();
+}
+
+__attribute__((noinline, noclone)) void
+compare_v16qi (__v16qi a, __v16qi b)
+{
+  for (int i = 0; i < 16; i++)
+    if (a[i] != b[i])
+      __builtin_abort ();
+}
+
+int main (void)
+{
+  __v8hi s1, s2, s3, s4, s5, s6;
+  __v16qi s7, s8, s9, s10, s11, s12;
+  s1 = (__v8hi) {0, 1, 2, 3, 4, 5, 6, 7};
+  s2 = (__v8hi) {8, 9, 10, 11, 12, 13, 14, 15};
+  s7 = (__v16qi) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  s8 = (__v16qi) {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+  s3 = foo1 (s1, s2);
+  s4 = foo2 (s1, s2);
+  s9 = foo3 (s7, s8);
+  s10 = foo4 (s7, s8);
+
+  s5 = (__v8hi) {0, 9, 2, 11, 4, 13, 6, 15};
+  s6 = (__v8hi) {8, 9, 2, 3, 4, 13, 14, 15};
+  s11 = (__v16qi) {0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31};
+  s12 = (__v16qi) {0, 1, 2, 3, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31};
+
+  compare_v8hi (s3, s5);
+  compare_v8hi (s4, s6);
+  compare_v16qi (s9, s11);
+  compare_v16qi (s10, s12);
+  return 0;
+}
|