author     Cui, Lili <lili.cui@intel.com>   2024-11-26 15:10:23 +0800
committer  Cui, Lili <lili.cui@intel.com>   2024-11-26 15:14:43 +0800
commit     60b708a9c878aff9a76ec0d446ae63e6527327a6 (patch)
tree       245336e7de82c341c760062f24c0d52e5b19729b /gcc
parent     efb1d2e2368e60da3c691ee3cb510ee690d1fa2a (diff)
Optimize 128-bit vector permutation with pand, pandn and por.
This patch introduces a new subroutine in ix86_expand_vec_perm_const_1.
On x86, two-operand constant permutations of V8HImode and V16QImode
vectors that merely blend the two inputs are now expanded with pand,
pandn and por when SSE2 is supported.  This improves the generated
assembly for configurations that only support SSE2.

gcc/ChangeLog:

	PR target/116675
	* config/i386/i386-expand.cc (expand_vec_perm_pand_pandn_por):
	New subroutine.
	(ix86_expand_vec_perm_const_1): Call expand_vec_perm_pand_pandn_por.

gcc/testsuite/ChangeLog:

	PR target/116675
	* gcc.target/i386/pr116675.c: New test.
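As a plain-C illustration of the idea (a minimal sketch, not part of the patch; the helper name model_blend and its scalar loop are mine): element i of the result is taken from op0 when perm[i] indexes the first operand and from op1 otherwise, which is exactly the all-ones/all-zeros constant mask the new subroutine feeds to the and/andnot/or sequence.

/* Illustrative scalar model only (not part of the patch).  dest[i] is
   op0[i] when perm[i] < nelt, otherwise op1[i]; the mask is ~0 in the
   op0 lanes and 0 in the op1 lanes, so the blend is
   (op0 & mask) | (op1 & ~mask), i.e. pand/pandn/por.  */
static void
model_blend (const unsigned short *op0, const unsigned short *op1,
             const unsigned char *perm, unsigned short *dest,
             unsigned int nelt)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned short mask = perm[i] < nelt ? 0xffff : 0;
      dest[i] = (unsigned short) ((op0[i] & mask) | (op1[i] & ~mask));
    }
}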
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/i386/i386-expand.cc            50
-rw-r--r--  gcc/testsuite/gcc.target/i386/pr116675.c  75
2 files changed, 125 insertions, 0 deletions
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index f8dcce4..2eb6197 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23102,6 +23102,53 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
   return true;
 }
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement a
+   permutation (which is a blend) with and, andnot and or when pshufb
+   is not available.
+
+   It handles cases such as:
+   __builtin_shufflevector (v1, v2, 0, 9, 2, 11, 4, 13, 6, 15);
+   __builtin_shufflevector (v1, v2, 8, 1, 2, 11, 4, 13, 6, 15);
+
+   For this expansion to apply, element i of the result must come from
+   lane i of either op0 or op1.  */
+
+static bool
+expand_vec_perm_pand_pandn_por (struct expand_vec_perm_d *d)
+{
+  rtx rperm[16], vperm;
+  unsigned int i, nelt = d->nelt;
+
+  if (!TARGET_SSE2
+      || d->one_operand_p
+      || (d->vmode != V16QImode && d->vmode != V8HImode))
+    return false;
+
+  if (d->perm[0] != 0)
+    return false;
+
+  /* dest[i] must select an element from either op0[i] or op1[i].  */
+  for (i = 1; i < nelt; i++)
+    if ((d->perm[i] % nelt) != i)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  /* Generate the blend mask for the AND and ANDNOT operations.  */
+  machine_mode inner_mode = GET_MODE_INNER (d->vmode);
+  for (i = 0; i < nelt; i++)
+    rperm[i] = (d->perm[i] < nelt) ? CONSTM1_RTX (inner_mode)
+                                   : CONST0_RTX (inner_mode);
+
+  vperm = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (nelt, rperm));
+  vperm = force_reg (d->vmode, vperm);
+
+  ix86_expand_sse_movcc (d->target, vperm, d->op0, d->op1);
+
+  return true;
+}
+
/* Implement permutation with pslldq + psrldq + por when pshufb is not
available. */
static bool
@@ -24161,6 +24208,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_psrlw_psllw_por (d))
     return true;
+  if (expand_vec_perm_pand_pandn_por (d))
+    return true;
+
   /* Try sequences of four instructions.  */
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/testsuite/gcc.target/i386/pr116675.c b/gcc/testsuite/gcc.target/i386/pr116675.c
new file mode 100644
index 0000000..e463dd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116675.c
@@ -0,0 +1,75 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2 -mno-ssse3" } */
+/* { dg-final { scan-assembler-times "pand" 4 } } */
+/* { dg-final { scan-assembler-times "pandn" 4 } } */
+/* { dg-final { scan-assembler-times "por" 4 } } */
+
+#include <emmintrin.h>
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v8hi foo1 (__v8hi a, __v8hi b)
+{
+  return __builtin_shufflevector (a, b, 0, 9, 2, 11, 4, 13, 6, 15);
+}
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v8hi foo2 (__v8hi a, __v8hi b)
+{
+  return __builtin_shufflevector (a, b, 8, 9, 2, 3, 4, 13, 14, 15);
+}
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v16qi foo3 (__v16qi a, __v16qi b)
+{
+  return __builtin_shufflevector (a, b, 0, 17, 2, 19, 4, 21, 6, 23,
+                                  8, 25, 10, 27, 12, 29, 14, 31);
+}
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v16qi foo4 (__v16qi a, __v16qi b)
+{
+  return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 21, 6, 23,
+                                  8, 25, 10, 27, 12, 29, 14, 31);
+}
+
+__attribute__((noinline, noclone)) void
+compare_v8hi (__v8hi a, __v8hi b)
+{
+  for (int i = 0; i < 8; i++)
+    if (a[i] != b[i])
+      __builtin_abort ();
+}
+
+__attribute__((noinline, noclone)) void
+compare_v16qi (__v16qi a, __v16qi b)
+{
+  for (int i = 0; i < 16; i++)
+    if (a[i] != b[i])
+      __builtin_abort ();
+}
+
+int main (void)
+{
+  __v8hi s1, s2, s3, s4, s5, s6;
+  __v16qi s7, s8, s9, s10, s11, s12;
+  s1 = (__v8hi) {0, 1, 2, 3, 4, 5, 6, 7};
+  s2 = (__v8hi) {8, 9, 10, 11, 12, 13, 14, 15};
+  s7 = (__v16qi) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  s8 = (__v16qi) {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+  s3 = foo1 (s1, s2);
+  s4 = foo2 (s1, s2);
+  s9 = foo3 (s7, s8);
+  s10 = foo4 (s7, s8);
+
+  s5 = (__v8hi) {0, 9, 2, 11, 4, 13, 6, 15};
+  s6 = (__v8hi) {8, 9, 2, 3, 4, 13, 14, 15};
+  s11 = (__v16qi) {0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31};
+  s12 = (__v16qi) {0, 1, 2, 3, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31};
+
+  compare_v8hi (s3, s5);
+  compare_v8hi (s4, s6);
+  compare_v16qi (s9, s11);
+  compare_v16qi (s10, s12);
+  return 0;
+}
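For reference, the blend that foo1 above exercises corresponds to the hand-written SSE2 intrinsic sequence below (an illustrative equivalent only, not compiler output from this patch; the helper name blend_even_odd is mine):

/* Hand-written SSE2 equivalent of foo1, for illustration only.  The
   mask is all-ones in the lanes taken from 'a' (indices 0, 2, 4, 6)
   and zero in the lanes taken from 'b', so and/andnot/or implement
   the blend.  */
#include <emmintrin.h>

static __m128i
blend_even_odd (__m128i a, __m128i b)
{
  /* _mm_set_epi16 lists elements from high to low, so lane 0 is the
     last argument.  */
  const __m128i mask = _mm_set_epi16 (0, -1, 0, -1, 0, -1, 0, -1);
  return _mm_or_si128 (_mm_and_si128 (a, mask),
                       _mm_andnot_si128 (mask, b));
}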