author     Cui, Lili <lili.cui@intel.com>   2024-11-26 15:10:23 +0800
committer  Cui, Lili <lili.cui@intel.com>   2024-11-26 15:14:43 +0800
commit     60b708a9c878aff9a76ec0d446ae63e6527327a6 (patch)
tree       245336e7de82c341c760062f24c0d52e5b19729b /gcc
parent     efb1d2e2368e60da3c691ee3cb510ee690d1fa2a (diff)
Optimize 128-bit vector permutation with pand, pandn and por.
This patch introduces a new subroutine in ix86_expand_vec_perm_const_1.
On x86, two-operand constant permutations of V8HImode and V16QImode
vectors that merely blend the two inputs are now expanded with pand,
pandn and por when SSE2 is supported.  This improves the generated
assembly for configurations that only support SSE2.

gcc/ChangeLog:

	PR target/116675
	* config/i386/i386-expand.cc (expand_vec_perm_pand_pandn_por):
	New subroutine.
	(ix86_expand_vec_perm_const_1): Call expand_vec_perm_pand_pandn_por.

gcc/testsuite/ChangeLog:

	PR target/116675
	* gcc.target/i386/pr116675.c: New test.
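As a plain-C illustration of the idea (a minimal sketch, not part of the patch; the helper name model_blend and its scalar loop are mine): element i of the result is taken from op0 when perm[i] indexes the first operand and from op1 otherwise, which is exactly the all-ones/all-zeros constant mask the new subroutine feeds to the and/andnot/or sequence.

/* Illustrative scalar model only (not part of the patch).  dest[i] is
   op0[i] when perm[i] < nelt, otherwise op1[i]; the mask is ~0 in the
   op0 lanes and 0 in the op1 lanes, so the blend is
   (op0 & mask) | (op1 & ~mask), i.e. pand/pandn/por.  */
static void
model_blend (const unsigned short *op0, const unsigned short *op1,
             const unsigned char *perm, unsigned short *dest,
             unsigned int nelt)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned short mask = perm[i] < nelt ? 0xffff : 0;
      dest[i] = (unsigned short) ((op0[i] & mask) | (op1[i] & ~mask));
    }
}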
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/i386/i386-expand.cc            50
-rw-r--r--  gcc/testsuite/gcc.target/i386/pr116675.c  75
2 files changed, 125 insertions, 0 deletions
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index f8dcce4..2eb6197 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -23102,6 +23102,53 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
   return true;
 }
+/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement a
+   permutation (which is a blend) with and, andnot and or when pshufb
+   is not available.
+
+   It handles cases such as:
+   __builtin_shufflevector (v1, v2, 0, 9, 2, 11, 4, 13, 6, 15);
+   __builtin_shufflevector (v1, v2, 8, 1, 2, 11, 4, 13, 6, 15);
+
+   For this expansion to apply, element i of the result must come from
+   lane i of either op0 or op1.  */
+
+static bool
+expand_vec_perm_pand_pandn_por (struct expand_vec_perm_d *d)
+{
+  rtx rperm[16], vperm;
+  unsigned int i, nelt = d->nelt;
+
+  if (!TARGET_SSE2
+      || d->one_operand_p
+      || (d->vmode != V16QImode && d->vmode != V8HImode))
+    return false;
+
+  if (d->perm[0] != 0)
+    return false;
+
+  /* dest[i] must select an element from either op0[i] or op1[i].  */
+  for (i = 1; i < nelt; i++)
+    if ((d->perm[i] % nelt) != i)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  /* Generate the blend mask for the AND and ANDNOT operations.  */
+  machine_mode inner_mode = GET_MODE_INNER (d->vmode);
+  for (i = 0; i < nelt; i++)
+    rperm[i] = (d->perm[i] < nelt) ? CONSTM1_RTX (inner_mode)
+                                   : CONST0_RTX (inner_mode);
+
+  vperm = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (nelt, rperm));
+  vperm = force_reg (d->vmode, vperm);
+
+  ix86_expand_sse_movcc (d->target, vperm, d->op0, d->op1);
+
+  return true;
+}
+
/* Implement permutation with pslldq + psrldq + por when pshufb is not
available. */
static bool
@@ -24161,6 +24208,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_psrlw_psllw_por (d))
     return true;
+  if (expand_vec_perm_pand_pandn_por (d))
+    return true;
+
   /* Try sequences of four instructions.  */
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/testsuite/gcc.target/i386/pr116675.c b/gcc/testsuite/gcc.target/i386/pr116675.c
new file mode 100644
index 0000000..e463dd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116675.c
@@ -0,0 +1,75 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2 -mno-ssse3" } */
+/* { dg-final { scan-assembler-times "pand" 4 } } */
+/* { dg-final { scan-assembler-times "pandn" 4 } } */
+/* { dg-final { scan-assembler-times "por" 4 } } */
+
+#include <emmintrin.h>
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v8hi foo1 (__v8hi a, __v8hi b)
+{
+  return __builtin_shufflevector (a, b, 0, 9, 2, 11, 4, 13, 6, 15);
+}
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v8hi foo2 (__v8hi a, __v8hi b)
+{
+  return __builtin_shufflevector (a, b, 8, 9, 2, 3, 4, 13, 14, 15);
+}
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v16qi foo3 (__v16qi a, __v16qi b)
+{
+  return __builtin_shufflevector (a, b, 0, 17, 2, 19, 4, 21, 6, 23,
+                                  8, 25, 10, 27, 12, 29, 14, 31);
+}
+
+__attribute__((noinline, noclone, target("sse2")))
+static __v16qi foo4 (__v16qi a, __v16qi b)
+{
+  return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 21, 6, 23,
+                                  8, 25, 10, 27, 12, 29, 14, 31);
+}
+
+__attribute__((noinline, noclone)) void
+compare_v8hi (__v8hi a, __v8hi b)
+{
+  for (int i = 0; i < 8; i++)
+    if (a[i] != b[i])
+      __builtin_abort ();
+}
+
+__attribute__((noinline, noclone)) void
+compare_v16qi (__v16qi a, __v16qi b)
+{
+  for (int i = 0; i < 16; i++)
+    if (a[i] != b[i])
+      __builtin_abort ();
+}
+
+int main (void)
+{
+  __v8hi s1, s2, s3, s4, s5, s6;
+  __v16qi s7, s8, s9, s10, s11, s12;
+  s1 = (__v8hi) {0, 1, 2, 3, 4, 5, 6, 7};
+  s2 = (__v8hi) {8, 9, 10, 11, 12, 13, 14, 15};
+  s7 = (__v16qi) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  s8 = (__v16qi) {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+  s3 = foo1 (s1, s2);
+  s4 = foo2 (s1, s2);
+  s9 = foo3 (s7, s8);
+  s10 = foo4 (s7, s8);
+
+  s5 = (__v8hi) {0, 9, 2, 11, 4, 13, 6, 15};
+  s6 = (__v8hi) {8, 9, 2, 3, 4, 13, 14, 15};
+  s11 = (__v16qi) {0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31};
+  s12 = (__v16qi) {0, 1, 2, 3, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31};
+
+  compare_v8hi (s3, s5);
+  compare_v8hi (s4, s6);
+  compare_v16qi (s9, s11);
+  compare_v16qi (s10, s12);
+  return 0;
+}
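For reference, the blend that foo1 above exercises corresponds to the hand-written SSE2 intrinsic sequence below (an illustrative equivalent only, not compiler output from this patch; the helper name blend_even_odd is mine):

/* Hand-written SSE2 equivalent of foo1, for illustration only.  The
   mask is all-ones in the lanes taken from 'a' (indices 0, 2, 4, 6)
   and zero in the lanes taken from 'b', so and/andnot/or implement
   the blend.  */
#include <emmintrin.h>

static __m128i
blend_even_odd (__m128i a, __m128i b)
{
  /* _mm_set_epi16 lists elements from high to low, so lane 0 is the
     last argument.  */
  const __m128i mask = _mm_set_epi16 (0, -1, 0, -1, 0, -1, 0, -1);
  return _mm_or_si128 (_mm_and_si128 (a, mask),
                       _mm_andnot_si128 (mask, b));
}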