11 files changed, 595 insertions, 14 deletions
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 12cec61..ef6c12c 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -27034,6 +27034,109 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
   return target;
 }
 
+/* GF2P8AFFINEQB matrixes to implement shift and rotate.  */
+
+static const uint64_t matrix_ashift[8] =
+{
+  0,
+  0x0001020408102040, /* 1 l */
+  0x0000010204081020, /* 2 l */
+  0x0000000102040810, /* 3 l */
+  0x0000000001020408, /* 4 l */
+  0x0000000000010204, /* 5 l */
+  0x0000000000000102, /* 6 l */
+  0x0000000000000001  /* 7 l */
+};
+
+static const uint64_t matrix_lshiftrt[8] =
+{
+  0,
+  0x0204081020408000, /* 1 r */
+  0x0408102040800000, /* 2 r */
+  0x0810204080000000, /* 3 r */
+  0x1020408000000000, /* 4 r */
+  0x2040800000000000, /* 5 r */
+  0x4080000000000000, /* 6 r */
+  0x8000000000000000  /* 7 r */
+};
+
+static const uint64_t matrix_ashiftrt[8] =
+{
+  0,
+  0x0204081020408080, /* 1 r */
+  0x0408102040808080, /* 2 r */
+  0x0810204080808080, /* 3 r */
+  0x1020408080808080, /* 4 r */
+  0x2040808080808080, /* 5 r */
+  0x4080808080808080, /* 6 r */
+  0x8080808080808080  /* 7 r */
+};
+
+static const uint64_t matrix_rotate[8] =
+{
+  0,
+  0x8001020408102040, /* 1 rol8 */
+  0x4080010204081020, /* 2 rol8 */
+  0x2040800102040810, /* 3 rol8 */
+  0x1020408001020408, /* 4 rol8 */
+  0x0810204080010204, /* 5 rol8 */
+  0x0408102040800102, /* 6 rol8 */
+  0x0204081020408001  /* 7 rol8 */
+};
+
+static const uint64_t matrix_rotatert[8] =
+{
+  0,
+  0x0204081020408001, /* 1 ror8 */
+  0x0408102040800102, /* 2 ror8 */
+  0x0810204080010204, /* 3 ror8 */
+  0x1020408001020408, /* 4 ror8 */
+  0x2040800102040810, /* 5 ror8 */
+  0x4080010204081020, /* 6 ror8 */
+  0x8001020408102040  /* 7 ror8 */
+};
+
+/* Return rtx to load a 64bit GF2P8AFFINE GP(2) matrix implementing a shift
+   for CODE and shift count COUNT into register with vector of size of SRC.  */
+
+rtx
+ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code)
+{
+  machine_mode mode = GET_MODE (src);
+  const uint64_t *matrix;
+  unsigned shift = INTVAL (count) & 7;
+  gcc_assert (shift > 0 && shift < 8);
+
+  switch (code)
+    {
+    case ASHIFT:
+      matrix = matrix_ashift;
+      break;
+    case ASHIFTRT:
+      matrix = matrix_ashiftrt;
+      break;
+    case LSHIFTRT:
+      matrix = matrix_lshiftrt;
+      break;
+    case ROTATE:
+      matrix = matrix_rotate;
+      break;
+    case ROTATERT:
+      matrix = matrix_rotatert;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  int nelts = GET_MODE_NUNITS (mode);
+  rtvec vec = rtvec_alloc (nelts);
+  uint64_t ma = matrix[shift];
+  for (int i = 0; i < nelts; i++)
+    RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode);
+
+  return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec));
+}
+
 /* Trunc a vector to a narrow vector, like v4di -> v4si.  */
 
 void
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index ee6b78b..bdb8bb9 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -448,3 +448,4 @@ extern void ix86_set_handled_components (sbitmap);
 /* In i386-expand.cc.  */
 bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*,
 				   HOST_WIDE_INT*);
+rtx ix86_vgf2p8affine_shift_matrix (rtx, rtx, enum rtx_code);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 55c9b16..b4b84b9 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22102,6 +22102,15 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
 	    }
 	  /* FALLTHRU */
 	case V32QImode:
+	  if (TARGET_GFNI && constant_op1)
+	    {
+	      /* Use vgf2p8affine. One extra load for the mask, but in a loop
+		 with enough registers it will be moved out. So for now don't
+		 account the constant mask load. This is not quite right
+		 for non loop vectorization.  */
+	      extra = 0;
+	      return ix86_vec_cost (mode, cost->sse_op) + extra;
+	    }
 	  if (TARGET_AVX2)
 	    /* Use vpbroadcast.  */
 	    extra = cost->sse_op;
@@ -22136,6 +22145,11 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
 	    count = 9;
 	  return ix86_vec_cost (mode, cost->sse_op * count) + extra;
 
+	case V64QImode:
+	  /* Ignore the mask load for GF2P8AFFINEQB.  */
+	  extra = 0;
+	  return ix86_vec_cost (mode, cost->sse_op) + extra;
+
 	case V2DImode:
 	case V4DImode:
 	  /* V*DImode arithmetic right shift is emulated.  */
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ec74f93..951ee54 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -326,6 +326,9 @@
 (define_mode_iterator VI1_AVX512VL
   [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")])
 
+(define_mode_iterator VI1_AVX512_3264
+  [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX")])
+
 ;; All vector modes
 (define_mode_iterator V
   [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
@@ -26559,9 +26562,9 @@
 
 ;; XOP packed rotate instructions
 (define_expand "rotl<mode>3"
-  [(set (match_operand:VI_128 0 "register_operand")
-	(rotate:VI_128
-	 (match_operand:VI_128 1 "nonimmediate_operand")
+  [(set (match_operand:VI248_128 0 "register_operand")
+	(rotate:VI248_128
+	 (match_operand:VI248_128 1 "nonimmediate_operand")
 	 (match_operand:SI 2 "general_operand")))]
   "TARGET_XOP"
 {
@@ -26590,9 +26593,9 @@
 })
 
 (define_expand "rotr<mode>3"
-  [(set (match_operand:VI_128 0 "register_operand")
-	(rotatert:VI_128
-	 (match_operand:VI_128 1 "nonimmediate_operand")
+  [(set (match_operand:VI248_128 0 "register_operand")
+	(rotatert:VI248_128
+	 (match_operand:VI248_128 1 "nonimmediate_operand")
 	 (match_operand:SI 2 "general_operand")))]
   "TARGET_XOP"
 {
@@ -26964,31 +26967,120 @@
       int i;
 
       if (<CODE> != ASHIFT)
-	{
-	  if (CONST_INT_P (operands[2]))
-	    operands[2] = GEN_INT (-INTVAL (operands[2]));
-	  else
-	    negate = true;
-	}
+       {
+	     if (CONST_INT_P (operands[2]))
+	       operands[2] = GEN_INT (-INTVAL (operands[2]));
+	     else
+	       negate = true;
+	   }
       par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
       tmp = lowpart_subreg (QImode, operands[2], SImode);
       for (i = 0; i < 16; i++)
-	XVECEXP (par, 0, i) = tmp;
+        XVECEXP (par, 0, i) = tmp;
 
       tmp = gen_reg_rtx (V16QImode);
       emit_insn (gen_vec_initv16qiqi (tmp, par));
 
       if (negate)
-	emit_insn (gen_negv16qi2 (tmp, tmp));
+        emit_insn (gen_negv16qi2 (tmp, tmp));
 
       gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
       emit_insn (gen (operands[0], operands[1], tmp));
     }
+  else if (TARGET_GFNI && CONST_INT_P (operands[2])
+           && (<MODE_SIZE> == 64
+               || !(INTVAL (operands[2]) == 7 && <CODE> == ASHIFTRT)))
+    {
+      rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2],
+						   <CODE>);
+      emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
+					    GEN_INT (0)));
+    }
   else
     ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
   DONE;
 })
 
+; not generated by vectorizer?
+(define_expand "cond_<insn><mode>"
+  [(set (match_operand:VI1_AVX512VL 0 "register_operand")
+	(vec_merge:VI1_AVX512VL
+	  (any_shift:VI1_AVX512VL
+	    (match_operand:VI1_AVX512VL 2 "register_operand")
+	    (match_operand:VI1_AVX512VL 3 "nonimmediate_or_const_vec_dup_operand"))
+	  (match_operand:VI1_AVX512VL 4 "nonimm_or_0_operand")
+	(match_operand:<avx512fmaskmode> 1 "register_operand")))]
+  "TARGET_GFNI && TARGET_AVX512F"
+{
+  rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+  emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[1], matrix,
+	        GEN_INT (0), operands[4], operands[1]));
+  DONE;
+})
+
+(define_expand "<insn><mode>3"
+  [(set (match_operand:VI1_AVX512_3264 0 "register_operand")
+	(any_rotate:VI1_AVX512_3264
+	  (match_operand:VI1_AVX512_3264 1 "general_operand")
+	  (match_operand:SI 2 "const_int_operand")))]
+  "TARGET_GFNI"
+{
+  rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+  emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
+             GEN_INT (0)));
+  DONE;
+})
+
+(define_expand "<insn>v16qi3"
+  [(set (match_operand:V16QI 0 "register_operand")
+     (any_rotate:V16QI
+       (match_operand:V16QI 1 "nonimmediate_operand")
+       (match_operand:SI 2 "general_operand")))]
+  "TARGET_GFNI || TARGET_XOP"
+{
+  /* Handle the V16QI XOP case to avoid a conflict with the other expand.  */
+  if (TARGET_XOP)
+    {
+      if (! const_0_to_7_operand (operands[2], SImode))
+        {
+          rtvec vs = rtvec_alloc (16);
+          rtx par = gen_rtx_PARALLEL (V16QImode, vs);
+          rtx reg = gen_reg_rtx (V16QImode);
+          rtx op2 = operands[2];
+          int i;
+
+          if (GET_MODE (op2) != QImode)
+            {
+              op2 = gen_reg_rtx (QImode);
+              convert_move (op2, operands[2], false);
+            }
+
+          for (i = 0; i < 16; i++)
+            RTVEC_ELT (vs, i) = op2;
+
+          emit_insn (gen_vec_initv16qiqi (reg, par));
+          if (<CODE> == ROTATERT)
+            {
+              rtx neg = gen_reg_rtx (V16QImode);
+              emit_insn (gen_negv16qi2 (neg, reg));
+              emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], neg));
+              reg = neg;
+            }
+          emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], reg));
+          DONE;
+       }
+    }
+  else if (TARGET_GFNI && CONST_INT_P (operands[2]))
+    {
+      rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+      emit_insn (gen_vgf2p8affineqb_v16qi (operands[0], operands[1], matrix,
+                 GEN_INT (0)));
+      DONE;
+    }
+  else
+    FAIL;
+})
+
 (define_expand "ashrv2di3"
   [(set (match_operand:V2DI 0 "register_operand")
 	(ashiftrt:V2DI
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c
new file mode 100644
index 0000000..e5be3a3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -mavx512vl -mavx512bw -mavx512f -O3" } */
+/* { dg-final { scan-assembler-times "vgf2p8affineqb" 14 } } */
+
+#ifndef N
+#define N 5
+#endif
+
+void
+ubyteshiftl (unsigned char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] <<= N;
+}
+
+void
+ubyteshiftr (unsigned char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] >>= N;
+}
+
+void
+ubyteshiftl_mask (unsigned char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    if (a[i] & 1)
+      a[i] <<= N;
+}
+
+void
+sbyteshiftl (signed char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] <<= N;
+}
+
+void
+sbyteshiftr (signed char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] >>= N;
+}
+
+void
+ubyteror (unsigned char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] = a[i] << N | a[i] >> (8 - N);
+}
+
+void
+ubyterol (unsigned char *a, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] = a[i] >> N | a[i] << (8 - N);
+}
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c
new file mode 100644
index 0000000..098361a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c
@@ -0,0 +1,196 @@
+/* { dg-do run } */
+/* { dg-options "-mgfni -mavx512vl -mavx512bw -mavx512f -O3 -Wno-shift-count-negative" } */
+
+#include <string.h>
+
+#ifndef N1
+#define N1 5
+#endif
+
+#ifndef N2
+#define N2 3
+#endif
+
+#ifndef N3
+#define N3 1
+#endif
+
+#ifndef N4
+#define N4 7
+#endif
+
+#ifndef N5
+#define N5 -3
+#endif
+
+#ifndef FILLER
+#define FILLER 0xab
+#endif
+
+#define FUNC(N)							\
+  void ubyteshiftl##N(unsigned char *a, int len)		\
+  {								\
+    int i;							\
+    for (i = 0; i < len; i++)					\
+      a[i] <<= N;						\
+  }								\
+								\
+  void ubyteshiftr##N(unsigned char *a, int len)		\
+  {								\
+    int i;							\
+    for (i = 0; i < len; i++)					\
+      a[i] >>= N;						\
+  }								\
+								\
+  void ubyteshiftl_mask##N(unsigned char *a, int len)		\
+  {								\
+    int i;							\
+    for (i = 0; i < len; i++)					\
+      if (a[i] & 1)						\
+	a[i] <<= N;						\
+  }								\
+								\
+  void sbyteshiftl##N(signed char *a, int len)			\
+  {								\
+    int i;							\
+    for (i = 0; i < len; i++)					\
+      a[i] <<= N;						\
+  }								\
+								\
+  void sbyteshiftr##N(signed char *a, int len)			\
+  {								\
+    int i;							\
+    for (i = 0; i < len; i++)					\
+      a[i] >>= N;						\
+  }								\
+								\
+  void ubyteror##N(unsigned char *a, int len)			\
+  {								\
+    int i;							\
+    for (i = 0; i < len; i++)					\
+      a[i] = a[i] << N | a[i] >> (8-N);				\
+  }								\
+								\
+  void ubyterol##N(unsigned char *a, int len)			\
+  {								\
+    int i;							\
+    for (i = 0; i < len; i++)					\
+      a[i] = a[i] >> N | a[i] << (8-N);				\
+  }								\
+  void ubyteshiftl##N##ref(unsigned char *a, int len)		\
+  {								\
+    int i;							\
+    _Pragma("GCC novector")					\
+      for (i = 0; i < len; i++)					\
+	a[i] <<= N;						\
+  }								\
+								\
+  void ubyteshiftr##N##ref(unsigned char *a, int len)		\
+  {								\
+    int i;							\
+    _Pragma("GCC novector")					\
+      for (i = 0; i < len; i++)					\
+	a[i] >>= N;						\
+  }								\
+								\
+  void ubyteshiftl_mask##N##ref(unsigned char *a, int len)	\
+  {								\
+    int i;							\
+    _Pragma("GCC novector")					\
+      for (i = 0; i < len; i++)					\
+	if (a[i] & 1)						\
+	  a[i] <<= N;						\
+  }								\
+								\
+  void sbyteshiftl##N##ref(signed char *a, int len)		\
+  {								\
+    int i;							\
+    _Pragma("GCC novector")					\
+      for (i = 0; i < len; i++)					\
+	a[i] <<= N;						\
+  }								\
+								\
+  void sbyteshiftr##N##ref(signed char *a, int len)		\
+  {								\
+    int i;							\
+    _Pragma("GCC novector")					\
+      for (i = 0; i < len; i++)					\
+	a[i] >>= N;						\
+  }								\
+								\
+  void ubyteror##N##ref(unsigned char *a, int len)		\
+  {								\
+    int i;							\
+    _Pragma("GCC novector")					\
+      for (i = 0; i < len; i++)					\
+	a[i] = a[i] << N | a[i] >> (8-N);			\
+  }								\
+								\
+  void ubyterol##N##ref(unsigned char *a, int len)		\
+  {								\
+    int i;							\
+    _Pragma("GCC novector")					\
+      for (i = 0; i < len; i++)					\
+	a[i] = a[i] >> N | a[i] << (8-N);			\
+  }
+
+FUNC (N1)
+FUNC (N2)
+FUNC (N3)
+FUNC (N4)
+FUNC (N5)
+
+#define TEST(N, func)					\
+  memset (array, filler, len);				\
+  func##N (array, len);					\
+  memset (array2, filler, len);				\
+  func##N##ref (array2, len);				\
+  if (memcmp (array, array2, len)) __builtin_abort ()
+
+int main ()
+{
+  __builtin_cpu_init ();
+  if (!__builtin_cpu_supports ("gfni"))
+    return 0;
+
+  const unsigned long len = 256;
+  char array[len], array2[len];
+  unsigned char filler = FILLER;
+
+  TEST (N1, ubyteshiftl);
+  TEST (N1, ubyteshiftl_mask);
+  TEST (N1, sbyteshiftl);
+  TEST (N1, sbyteshiftr);
+  TEST (N1, ubyteror);
+  TEST (N1, ubyterol);
+
+  TEST (N2, ubyteshiftl);
+  TEST (N2, ubyteshiftl_mask);
+  TEST (N2, sbyteshiftl);
+  TEST (N2, sbyteshiftr);
+  TEST (N2, ubyteror);
+  TEST (N2, ubyterol);
+
+  TEST (N3, ubyteshiftl);
+  TEST (N3, ubyteshiftl_mask);
+  TEST (N3, sbyteshiftl);
+  TEST (N3, sbyteshiftr);
+  TEST (N3, ubyteror);
+  TEST (N3, ubyterol);
+
+  TEST (N4, ubyteshiftl);
+  TEST (N4, ubyteshiftl_mask);
+  TEST (N4, sbyteshiftl);
+  TEST (N4, sbyteshiftr);
+  TEST (N4, ubyteror);
+  TEST (N4, ubyterol);
+
+  TEST (N5, ubyteshiftl);
+  TEST (N5, ubyteshiftl_mask);
+  TEST (N5, sbyteshiftl);
+  TEST (N5, sbyteshiftr);
+  TEST (N5, ubyteror);
+  TEST (N5, ubyterol);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c
new file mode 100644
index 0000000..9e5ae5d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -mavx512bw -mavx512f -O3" } */
+/* { dg-final { scan-assembler-times "vgf2p8affineqb" 12 } } */
+
+/* Based on a test case from Andrew Pinski */
+
+#ifndef N
+#define N 5
+#endif
+
+void
+ubyteshiftl (unsigned char *restrict a, unsigned char *restrict b, unsigned char *restrict c, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? (a[i] | b[i]) << N : a[i];
+      a[i] = (!c[i]) ? (a[i] ^ b[i]) << N : a[i];
+    }
+}
+
+void
+ubyteshiftr (unsigned char *restrict a, unsigned char *restrict b, unsigned char *restrict c, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? (a[i] | b[i]) >> N : a[i];
+      a[i] = (!c[i]) ? (a[i] ^ b[i]) >> N : a[i];
+    }
+}
+
+void
+sbyteshiftl (signed char *restrict a, signed char *restrict b, signed char *restrict c, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? (a[i] | b[i]) << N : a[i];
+      a[i] = (!c[i]) ? (a[i] ^ b[i]) << N : a[i];
+    }
+}
+
+void
+sbyteshiftr (signed char *restrict a, signed char *restrict b, signed char *restrict c, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? (a[i] | b[i]) >> N : a[i];
+      a[i] = (!c[i]) ? (a[i] ^ b[i]) >> N : a[i];
+    }
+}
+
+static inline unsigned char rol8(unsigned char v, int c)
+{
+  return (v >> c) | (v << (8-c));
+}
+
+static inline unsigned char ror8(unsigned char v, int c)
+{
+  return (v << c) | (v >> (8-c));
+}
+
+void
+ubyterol (unsigned char *restrict a, unsigned char *restrict b, unsigned char *restrict c, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? rol8(a[i] | b[i], N) : a[i];
+      a[i] = (!c[i]) ? rol8(a[i] ^ b[i], N) : a[i];
+    }
+}
+
+void
+ubyteror (unsigned char *restrict a, unsigned char *restrict b, unsigned char *restrict c, int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? ror8(a[i] | b[i], N) : a[i];
+      a[i] = (!c[i]) ? ror8(a[i] ^ b[i], N) : a[i];
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-5.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-5.c
new file mode 100644
index 0000000..65fb692
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-5.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -mavx -O3 -Wno-shift-count-negative" } */
+/* { dg-final { scan-assembler-times "vgf2p8affineqb" 31 } } */
+
+#include "shift-gf2p8affine-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-6.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-6.c
new file mode 100644
index 0000000..3391deb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-6.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -O3 -Wno-shift-count-negative" } */
+/* { dg-final { scan-assembler-times "vgf2p8affineqb" 0 } } */
+
+#include "shift-gf2p8affine-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-7.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-7.c
new file mode 100644
index 0000000..37ba0c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-7.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -mavx512vl -mavx512bw -mavx512f -O3 -Wno-shift-count-negative" } */
+/* { dg-final { scan-assembler-times "vgf2p8affineqb" 53 } } */
+
+#include "shift-gf2p8affine-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/shift-v16qi-4.c b/gcc/testsuite/gcc.target/i386/shift-v16qi-4.c
new file mode 100644
index 0000000..edc2b21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/shift-v16qi-4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -mavx512vl -mavx512bw -mavx512f -O2" } */
+/* { dg-final { scan-assembler-times "vpcmpgtb" 1 } } */
+
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16qi
+foo (v16qi a)
+{
+  return a >> 7;
+}