Diffstat (limited to 'gcc/config/i386')
-rw-r--r--  gcc/config/i386/i386-expand.cc      129
-rw-r--r--  gcc/config/i386/i386-features.cc   1130
-rw-r--r--  gcc/config/i386/i386-modes.def        2
-rw-r--r--  gcc/config/i386/i386-options.cc      45
-rw-r--r--  gcc/config/i386/i386-passes.def       2
-rw-r--r--  gcc/config/i386/i386-protos.h         5
-rw-r--r--  gcc/config/i386/i386.cc             305
-rw-r--r--  gcc/config/i386/i386.h               59
-rw-r--r--  gcc/config/i386/i386.md             449
-rw-r--r--  gcc/config/i386/i386.opt              4
-rw-r--r--  gcc/config/i386/predicates.md        17
-rw-r--r--  gcc/config/i386/sse.md              135
-rw-r--r--  gcc/config/i386/x86-tune-costs.h    192
13 files changed, 1904 insertions, 570 deletions
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 09aa9b1..3278f1f 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -3151,7 +3151,7 @@ ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
}
/* Expand floating point op0 <=> op1, i.e.
- dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
+ dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : -128. */
void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
@@ -3264,7 +3264,7 @@ ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
if (l2)
{
emit_label (l2);
- emit_move_insn (dest, op2 == const0_rtx ? const2_rtx : op2);
+ emit_move_insn (dest, op2 == const0_rtx ? GEN_INT (-128) : op2);
}
emit_label (lend);
}
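Reviewer note: the revised mapping is easiest to read as a scalar sketch (illustrative only; the actual expansion emits compares, branches and conditional moves). The unordered case now produces -128 instead of 2.

/* Hedged sketch of the <=> result encoding after this change.  */
int
fp_spaceship (double a, double b)
{
  if (a == b)
    return 0;
  if (a < b)
    return -1;
  if (a > b)
    return 1;
  return -128;	/* unordered: at least one operand is a NaN.  */
}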
@@ -8241,8 +8241,10 @@ expand_cpymem_epilogue (rtx destmem, rtx srcmem,
unsigned HOST_WIDE_INT countval = UINTVAL (count);
unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
unsigned int destalign = MEM_ALIGN (destmem);
+ cfun->machine->by_pieces_in_use = true;
move_by_pieces (destmem, srcmem, epilogue_size, destalign,
RETURN_BEGIN);
+ cfun->machine->by_pieces_in_use = false;
return;
}
if (max_size > 8)
@@ -8405,8 +8407,8 @@ expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
/* Callback routine for store_by_pieces. Return the RTL of a register
containing GET_MODE_SIZE (MODE) bytes in the RTL register op_p which
- is a word or a word vector register. If PREV_P isn't nullptr, it
- has the RTL info from the previous iteration. */
+ is an integer or a word vector register. If PREV_P isn't nullptr,
+ it has the RTL info from the previous iteration. */
static rtx
setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
@@ -8435,10 +8437,6 @@ setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
rtx op = (rtx) op_p;
machine_mode op_mode = GET_MODE (op);
- gcc_assert (op_mode == word_mode
- || (VECTOR_MODE_P (op_mode)
- && GET_MODE_INNER (op_mode) == word_mode));
-
if (VECTOR_MODE_P (mode))
{
gcc_assert (GET_MODE_INNER (mode) == QImode);
@@ -8460,16 +8458,17 @@ setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
return tmp;
}
- target = gen_reg_rtx (word_mode);
if (VECTOR_MODE_P (op_mode))
{
+ gcc_assert (GET_MODE_INNER (op_mode) == word_mode);
+ target = gen_reg_rtx (word_mode);
op = gen_rtx_SUBREG (word_mode, op, 0);
emit_move_insn (target, op);
}
else
target = op;
- if (mode == word_mode)
+ if (mode == GET_MODE (target))
return target;
rtx tmp = gen_reg_rtx (mode);
@@ -8490,9 +8489,11 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
unsigned HOST_WIDE_INT countval = UINTVAL (count);
unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
unsigned int destalign = MEM_ALIGN (destmem);
+ cfun->machine->by_pieces_in_use = true;
store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
vec_value ? vec_value : value, destalign, true,
RETURN_BEGIN);
+ cfun->machine->by_pieces_in_use = false;
return;
}
if (max_size > 32)
@@ -9574,8 +9575,9 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
case vector_loop:
need_zero_guard = true;
unroll_factor = 4;
- /* Get the vector mode to move MOVE_MAX bytes. */
- nunits = MOVE_MAX / GET_MODE_SIZE (word_mode);
+ /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes. */
+ nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
+ nunits /= GET_MODE_SIZE (word_mode);
if (nunits > 1)
{
move_mode = mode_for_vector (word_mode, nunits).require ();
@@ -27033,6 +27035,109 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
return target;
}
+/* GF2P8AFFINEQB matrices to implement shift and rotate.  */
+
+static const uint64_t matrix_ashift[8] =
+{
+ 0,
+ 0x0001020408102040, /* 1 l */
+ 0x0000010204081020, /* 2 l */
+ 0x0000000102040810, /* 3 l */
+ 0x0000000001020408, /* 4 l */
+ 0x0000000000010204, /* 5 l */
+ 0x0000000000000102, /* 6 l */
+ 0x0000000000000001 /* 7 l */
+};
+
+static const uint64_t matrix_lshiftrt[8] =
+{
+ 0,
+ 0x0204081020408000, /* 1 r */
+ 0x0408102040800000, /* 2 r */
+ 0x0810204080000000, /* 3 r */
+ 0x1020408000000000, /* 4 r */
+ 0x2040800000000000, /* 5 r */
+ 0x4080000000000000, /* 6 r */
+ 0x8000000000000000 /* 7 r */
+};
+
+static const uint64_t matrix_ashiftrt[8] =
+{
+ 0,
+ 0x0204081020408080, /* 1 r */
+ 0x0408102040808080, /* 2 r */
+ 0x0810204080808080, /* 3 r */
+ 0x1020408080808080, /* 4 r */
+ 0x2040808080808080, /* 5 r */
+ 0x4080808080808080, /* 6 r */
+ 0x8080808080808080 /* 7 r */
+};
+
+static const uint64_t matrix_rotate[8] =
+{
+ 0,
+ 0x8001020408102040, /* 1 rol8 */
+ 0x4080010204081020, /* 2 rol8 */
+ 0x2040800102040810, /* 3 rol8 */
+ 0x1020408001020408, /* 4 rol8 */
+ 0x0810204080010204, /* 5 rol8 */
+ 0x0408102040800102, /* 6 rol8 */
+ 0x0204081020408001 /* 7 rol8 */
+};
+
+static const uint64_t matrix_rotatert[8] =
+{
+ 0,
+ 0x0204081020408001, /* 1 ror8 */
+ 0x0408102040800102, /* 2 ror8 */
+ 0x0810204080010204, /* 3 ror8 */
+ 0x1020408001020408, /* 4 ror8 */
+ 0x2040800102040810, /* 5 ror8 */
+ 0x4080010204081020, /* 6 ror8 */
+ 0x8001020408102040 /* 7 ror8 */
+};
+
+/* Return an rtx to load the 64-bit GF2P8AFFINE GF(2) matrix implementing
+   a shift for CODE with shift count COUNT into a register with the same
+   vector mode as SRC.  */
+
+rtx
+ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code)
+{
+ machine_mode mode = GET_MODE (src);
+ const uint64_t *matrix;
+ unsigned shift = INTVAL (count) & 7;
+ gcc_assert (shift > 0 && shift < 8);
+
+ switch (code)
+ {
+ case ASHIFT:
+ matrix = matrix_ashift;
+ break;
+ case ASHIFTRT:
+ matrix = matrix_ashiftrt;
+ break;
+ case LSHIFTRT:
+ matrix = matrix_lshiftrt;
+ break;
+ case ROTATE:
+ matrix = matrix_rotate;
+ break;
+ case ROTATERT:
+ matrix = matrix_rotatert;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ int nelts = GET_MODE_NUNITS (mode);
+ rtvec vec = rtvec_alloc (nelts);
+ uint64_t ma = matrix[shift];
+ for (int i = 0; i < nelts; i++)
+ RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode);
+
+ return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec));
+}
+
/* Trunc a vector to a narrow vector, like v4di -> v4si. */
void
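Reviewer note: these tables can be cross-checked against the definition of the byte-wise GF(2) affine transform used by GF2P8AFFINEQB, where result bit i is the parity of (matrix byte (7 - i) AND data byte); this is the same little-endian byte order that the loop in ix86_vgf2p8affine_shift_matrix replicates across the vector. A minimal standalone sketch (illustrative, not GCC code):

#include <assert.h>
#include <stdint.h>

/* Byte-wise GF(2) affine transform as performed by GF2P8AFFINEQB:
   result bit i is the parity of (matrix byte (7 - i) & data byte).  */
static uint8_t
gf2p8affine_byte (uint64_t matrix, uint8_t b)
{
  uint8_t r = 0;
  for (int i = 0; i < 8; i++)
    {
      uint8_t row = (matrix >> ((7 - i) * 8)) & 0xff;
      r |= __builtin_parity (row & b) << i;
    }
  return r;
}

int
main (void)
{
  for (int b = 0; b < 256; b++)
    {
      /* matrix_ashift[1]: every byte shifted left by 1.  */
      assert (gf2p8affine_byte (0x0001020408102040, b) == (uint8_t) (b << 1));
      /* matrix_rotate[1]: every byte rotated left by 1.  */
      assert (gf2p8affine_byte (0x8001020408102040, b)
	      == (uint8_t) ((b << 1) | (b >> 7)));
    }
  return 0;
}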
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c131577..0608dd2 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3085,21 +3085,68 @@ ix86_rpad_gate ()
&& optimize_function_for_speed_p (cfun));
}
+enum x86_cse_kind
+{
+ X86_CSE_CONST0_VECTOR,
+ X86_CSE_CONSTM1_VECTOR,
+ X86_CSE_VEC_DUP,
+ X86_CSE_TLS_GD,
+ X86_CSE_TLS_LD_BASE,
+ X86_CSE_TLSDESC
+};
+
+struct redundant_pattern
+{
+ /* Bitmap of basic blocks with broadcast instructions. */
+ auto_bitmap bbs;
+ /* Bitmap of broadcast instructions. */
+ auto_bitmap insns;
+ /* The broadcast inner scalar. */
+ rtx val;
+ /* The actual redundant source value for UNSPEC_TLSDESC. */
+ rtx tlsdesc_val;
+ /* The inner scalar mode. */
+ machine_mode mode;
+ /* The instruction which sets the inner scalar. Nullptr if the inner
+ scalar is applied to the whole function, instead of within the same
+ block. */
+ rtx_insn *def_insn;
+ /* The widest broadcast source. */
+ rtx broadcast_source;
+ /* The widest broadcast register. */
+ rtx broadcast_reg;
+ /* The basic block of the broadcast instruction. */
+ basic_block bb;
+ /* The number of broadcast instructions with the same inner scalar. */
+ unsigned HOST_WIDE_INT count;
+ /* The threshold of broadcast instructions with the same inner
+ scalar. */
+ unsigned int threshold;
+ /* The widest broadcast size in bytes. */
+ unsigned int size;
+ /* Load kind. */
+ x86_cse_kind kind;
+};
+
/* Generate a vector set, DEST = SRC, at entry of the nearest dominator
for basic block map BBS, which is in the fake loop that contains the
whole function, so that there is only a single vector set in the
- whole function. If not nullptr, INNER_SCALAR is the inner scalar of
- SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)). */
+ whole function. If not nullptr, LOAD is a pointer to the load. */
static void
ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
- rtx inner_scalar = nullptr)
+ redundant_pattern *load = nullptr)
{
basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
- while (bb->loop_father->latch
- != EXIT_BLOCK_PTR_FOR_FN (cfun))
- bb = get_immediate_dominator (CDI_DOMINATORS,
- bb->loop_father->header);
+ /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop
+ to avoid extra spills. */
+ if (!load || load->kind != X86_CSE_VEC_DUP)
+ {
+ while (bb->loop_father->latch
+ != EXIT_BLOCK_PTR_FOR_FN (cfun))
+ bb = get_immediate_dominator (CDI_DOMINATORS,
+ bb->loop_father->header);
+ }
rtx set = gen_rtx_SET (dest, src);
@@ -3141,8 +3188,14 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
}
}
- if (inner_scalar)
+ if (load && load->kind == X86_CSE_VEC_DUP)
{
+ /* Get the source from LOAD as (reg:SI 99) in
+
+ (vec_duplicate:V4SI (reg:SI 99))
+
+ */
+ rtx inner_scalar = load->val;
/* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */
rtx reg = XEXP (src, 0);
if ((REG_P (inner_scalar) || MEM_P (inner_scalar))
@@ -3226,7 +3279,7 @@ remove_partial_avx_dependency (void)
break;
}
- /* Only hanlde conversion here. */
+ /* Only handle conversion here. */
machine_mode src_mode
= convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
switch (src_mode)
@@ -3489,44 +3542,6 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
}
}
-enum x86_cse_kind
-{
- X86_CSE_CONST0_VECTOR,
- X86_CSE_CONSTM1_VECTOR,
- X86_CSE_VEC_DUP
-};
-
-struct redundant_load
-{
- /* Bitmap of basic blocks with broadcast instructions. */
- auto_bitmap bbs;
- /* Bitmap of broadcast instructions. */
- auto_bitmap insns;
- /* The broadcast inner scalar. */
- rtx val;
- /* The inner scalar mode. */
- machine_mode mode;
- /* The instruction which sets the inner scalar. Nullptr if the inner
- scalar is applied to the whole function, instead of within the same
- block. */
- rtx_insn *def_insn;
- /* The widest broadcast source. */
- rtx broadcast_source;
- /* The widest broadcast register. */
- rtx broadcast_reg;
- /* The basic block of the broadcast instruction. */
- basic_block bb;
- /* The number of broadcast instructions with the same inner scalar. */
- unsigned HOST_WIDE_INT count;
- /* The threshold of broadcast instructions with the same inner
- scalar. */
- unsigned int threshold;
- /* The widest broadcast size in bytes. */
- unsigned int size;
- /* Load kind. */
- x86_cse_kind kind;
-};
-
/* Return the inner scalar if OP is a broadcast, else return nullptr. */
static rtx
@@ -3629,6 +3644,8 @@ ix86_broadcast_inner (rtx op, machine_mode mode,
Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an
integer constant. */
op = src;
+ if (mode != GET_MODE (reg))
+ op = gen_int_mode (INTVAL (src), mode);
*insn_p = nullptr;
}
else
@@ -3669,25 +3686,719 @@ ix86_broadcast_inner (rtx op, machine_mode mode,
return op;
}
-/* At entry of the nearest common dominator for basic blocks with vector
- CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
- vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
- uses.
+/* Replace CALL instruction in TLS_CALL_INSNS with SET from SRC and
+ put the updated instruction in UPDATED_TLS_INSNS. */
- NB: We want to generate only a single widest vector set to cover the
- whole function. The LCM algorithm isn't appropriate here since it
- may place a vector set inside the loop. */
+static void
+replace_tls_call (rtx src, auto_bitmap &tls_call_insns,
+ auto_bitmap &updated_tls_insns)
+{
+ bitmap_iterator bi;
+ unsigned int id;
-static unsigned int
-remove_redundant_vector_load (void)
+ EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi)
+ {
+ rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
+
+ /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are
+ allowed. */
+ if (!CALL_P (insn))
+ {
+ attr_tls64 tls64 = get_attr_tls64 (insn);
+ if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE)
+ gcc_unreachable ();
+ }
+
+ rtx pat = PATTERN (insn);
+ gcc_assert (GET_CODE (pat) == PARALLEL);
+ rtx set = XVECEXP (pat, 0, 0);
+ gcc_assert (GET_CODE (set) == SET);
+ rtx dest = SET_DEST (set);
+
+ set = gen_rtx_SET (dest, src);
+ rtx_insn *set_insn = emit_insn_after (set, insn);
+ if (recog_memoized (set_insn) < 0)
+ gcc_unreachable ();
+
+ /* Put SET_INSN in UPDATED_TLS_INSNS. */
+ bitmap_set_bit (updated_tls_insns, INSN_UID (set_insn));
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nReplace:\n\n");
+ print_rtl_single (dump_file, insn);
+ fprintf (dump_file, "\nwith:\n\n");
+ print_rtl_single (dump_file, set_insn);
+ fprintf (dump_file, "\n");
+ }
+
+ /* Delete the CALL insn. */
+ delete_insn (insn);
+
+ df_insn_rescan (set_insn);
+ }
+}
+
+/* Return the basic block which dominates all basic blocks which set
+ hard register REGNO used in basic block BB. */
+
+static basic_block
+ix86_get_dominator_for_reg (unsigned int regno, basic_block bb)
+{
+ basic_block set_bb;
+ auto_bitmap set_bbs;
+
+ /* Get all BBs which set REGNO and dominate the current BB from all
+ DEFs of REGNO. */
+ for (df_ref def = DF_REG_DEF_CHAIN (regno);
+ def;
+ def = DF_REF_NEXT_REG (def))
+ if (!DF_REF_IS_ARTIFICIAL (def)
+ && !DF_REF_FLAGS_IS_SET (def, DF_REF_MAY_CLOBBER)
+ && !DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER))
+ {
+ set_bb = DF_REF_BB (def);
+ if (dominated_by_p (CDI_DOMINATORS, bb, set_bb))
+ bitmap_set_bit (set_bbs, set_bb->index);
+ }
+
+ bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
+ return bb;
+}
+
+/* Mark FLAGS register as live in DATA, a bitmap of live caller-saved
+ registers, if DEST is FLAGS register. */
+
+static void
+ix86_check_flags_reg (rtx dest, const_rtx, void *data)
+{
+ auto_bitmap *live_caller_saved_regs = (auto_bitmap *) data;
+ if (REG_P (dest) && REGNO (dest) == FLAGS_REG)
+ bitmap_set_bit (*live_caller_saved_regs, FLAGS_REG);
+}
+
+/* Emit a TLS_SET instruction of KIND in basic block BB. Store the
+ insertion point in *BEFORE_P for emit_insn_before or in *AFTER_P
+ for emit_insn_after. UPDATED_GNU_TLS_INSNS contains instructions
+ which replace the GNU TLS instructions. UPDATED_GNU2_TLS_INSNS
+ contains instructions which replace the GNU2 TLS instructions. */
+
+static rtx_insn *
+ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb,
+ rtx_insn **before_p, rtx_insn **after_p,
+ auto_bitmap &updated_gnu_tls_insns,
+ auto_bitmap &updated_gnu2_tls_insns)
+{
+ rtx_insn *tls_insn;
+
+ do
+ {
+ rtx_insn *insn = BB_HEAD (bb);
+ while (insn && !NONDEBUG_INSN_P (insn))
+ {
+ if (insn == BB_END (bb))
+ {
+ /* This must be the beginning basic block:
+
+ (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)
+
+ or a basic block with only a label:
+
+ (code_label 78 11 77 3 14 (nil) [1 uses])
+ (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
+
+ or a basic block with only a debug marker:
+
+ (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
+ (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))
+
+ */
+ gcc_assert (DEBUG_INSN_P (insn)
+ || (NOTE_P (insn)
+ && ((NOTE_KIND (insn)
+ == NOTE_INSN_FUNCTION_BEG)
+ || (NOTE_KIND (insn)
+ == NOTE_INSN_BASIC_BLOCK))));
+ insn = NULL;
+ break;
+ }
+ insn = NEXT_INSN (insn);
+ }
+
+      /* TLS_GD and TLS_LD_BASE instructions are normal function calls
+	 which clobber caller-saved registers.  TLSDESC instructions
+	 only clobber FLAGS.  If any registers clobbered by TLS
+	 instructions are live in this basic block, we must insert the
+	 TLS instructions after all such live registers become dead.  */
+
+ auto_bitmap live_caller_saved_regs;
+ bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb);
+
+ if (bitmap_bit_p (in, FLAGS_REG))
+ bitmap_set_bit (live_caller_saved_regs, FLAGS_REG);
+
+ unsigned int i;
+
+ /* Get all live caller-saved registers for TLS_GD and TLS_LD_BASE
+ instructions. */
+ if (kind != X86_CSE_TLSDESC)
+ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ if (call_used_regs[i]
+ && !fixed_regs[i]
+ && bitmap_bit_p (in, i))
+ bitmap_set_bit (live_caller_saved_regs, i);
+
+ if (bitmap_empty_p (live_caller_saved_regs))
+ {
+ if (insn == BB_HEAD (bb))
+ {
+ *before_p = insn;
+ tls_insn = emit_insn_before (tls_set, insn);
+ }
+ else
+ {
+ /* Emit the TLS call after NOTE_INSN_FUNCTION_BEG in the
+ beginning basic block:
+
+ (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)
+
+ or after NOTE_INSN_BASIC_BLOCK in a basic block with
+ only a label:
+
+ (code_label 78 11 77 3 14 (nil) [1 uses])
+ (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
+
+ or after debug marker in a basic block with only a
+ debug marker:
+
+ (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
+ (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))
+
+ */
+ insn = insn ? PREV_INSN (insn) : BB_END (bb);
+ *after_p = insn;
+ tls_insn = emit_insn_after (tls_set, insn);
+ }
+ return tls_insn;
+ }
+
+ bool repeat = false;
+
+ /* Search for REG_DEAD notes in this basic block. */
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+
+	  /* NB: A conditional jump is the only instruction which reads
+	     the flags register and changes control flow.  We can never
+	     place the TLS call after an unconditional jump.  */
+ if (JUMP_P (insn))
+ {
+ /* This must be a conditional jump. */
+ rtx label = JUMP_LABEL (insn);
+ if (label == nullptr
+ || ANY_RETURN_P (label)
+ || !(LABEL_P (label) || SYMBOL_REF_P (label)))
+ gcc_unreachable ();
+
+	      /* Place the call before all FLAGS_REG setting BBs since
+		 we can't place a call either before or after a
+		 conditional jump.  */
+ bb = ix86_get_dominator_for_reg (FLAGS_REG, bb);
+
+ /* Start over again. */
+ repeat = true;
+ break;
+ }
+
+ if (bitmap_bit_p (updated_gnu_tls_insns, INSN_UID (insn)))
+ {
+ /* Insert the __tls_get_addr call before INSN which
+ replaces a __tls_get_addr call. */
+ *before_p = insn;
+ tls_insn = emit_insn_before (tls_set, insn);
+ return tls_insn;
+ }
+
+ if (bitmap_bit_p (updated_gnu2_tls_insns, INSN_UID (insn)))
+ {
+ /* Mark FLAGS register as dead since FLAGS register
+ would be clobbered by the GNU2 TLS instruction. */
+ bitmap_clear_bit (live_caller_saved_regs, FLAGS_REG);
+ continue;
+ }
+
+ /* Check if FLAGS register is live. */
+ note_stores (insn, ix86_check_flags_reg,
+ &live_caller_saved_regs);
+
+ rtx link;
+ for (link = REG_NOTES (insn); link; link = XEXP (link, 1))
+ if (REG_NOTE_KIND (link) == REG_DEAD
+ && REG_P (XEXP (link, 0)))
+ {
+ /* Mark the live caller-saved register as dead. */
+ for (i = REGNO (XEXP (link, 0));
+ i < END_REGNO (XEXP (link, 0));
+ i++)
+ if (i < FIRST_PSEUDO_REGISTER)
+ bitmap_clear_bit (live_caller_saved_regs, i);
+
+ if (bitmap_empty_p (live_caller_saved_regs))
+ {
+ *after_p = insn;
+ tls_insn = emit_insn_after (tls_set, insn);
+ return tls_insn;
+ }
+ }
+ }
+
+ /* NB: Start over again for conditional jump. */
+ if (repeat)
+ continue;
+
+ gcc_assert (!bitmap_empty_p (live_caller_saved_regs));
+
+ /* If any live caller-saved registers aren't dead at the end of
+ this basic block, get the basic block which dominates all
+ basic blocks which set the remaining live registers. */
+ auto_bitmap set_bbs;
+ bitmap_iterator bi;
+ unsigned int id;
+ EXECUTE_IF_SET_IN_BITMAP (live_caller_saved_regs, 0, id, bi)
+ {
+ basic_block set_bb = ix86_get_dominator_for_reg (id, bb);
+ bitmap_set_bit (set_bbs, set_bb->index);
+ }
+ bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
+ }
+ while (true);
+}
+
+/* Generate a TLS call of KIND with VAL and copy the call result to DEST,
+ at entry of the nearest dominator for basic block map BBS, which is in
+ the fake loop that contains the whole function, so that there is only
+ a single TLS CALL of KIND with VAL in the whole function.
+ UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU TLS
+ instructions. UPDATED_GNU2_TLS_INSNS contains instructions which
+ replace the GNU2 TLS instructions. If TLSDESC_SET isn't nullptr,
+ insert it before the TLS call. */
+
+static void
+ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind,
+ auto_bitmap &bbs,
+ auto_bitmap &updated_gnu_tls_insns,
+ auto_bitmap &updated_gnu2_tls_insns,
+ rtx tlsdesc_set = nullptr)
+{
+ basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
+ while (bb->loop_father->latch
+ != EXIT_BLOCK_PTR_FOR_FN (cfun))
+ bb = get_immediate_dominator (CDI_DOMINATORS,
+ bb->loop_father->header);
+
+ rtx rax = nullptr, rdi;
+ rtx eqv = nullptr;
+ rtx caddr;
+ rtx set;
+ rtx clob;
+ rtx symbol;
+ rtx tls;
+
+ switch (kind)
+ {
+ case X86_CSE_TLS_GD:
+ rax = gen_rtx_REG (Pmode, AX_REG);
+ rdi = gen_rtx_REG (Pmode, DI_REG);
+ caddr = ix86_tls_get_addr ();
+
+ symbol = XVECEXP (val, 0, 0);
+ tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi);
+
+ if (GET_MODE (symbol) != Pmode)
+ symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol);
+ eqv = symbol;
+ break;
+
+ case X86_CSE_TLS_LD_BASE:
+ rax = gen_rtx_REG (Pmode, AX_REG);
+ rdi = gen_rtx_REG (Pmode, DI_REG);
+ caddr = ix86_tls_get_addr ();
+
+ tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi);
+
+ /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers
+ to share the LD_BASE result with other LD model accesses. */
+ eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
+ UNSPEC_TLS_LD_BASE);
+
+ break;
+
+ case X86_CSE_TLSDESC:
+ set = gen_rtx_SET (dest, val);
+ clob = gen_rtx_CLOBBER (VOIDmode,
+ gen_rtx_REG (CCmode, FLAGS_REG));
+ tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Emit the TLS CALL insn. */
+ rtx_insn *before = nullptr;
+ rtx_insn *after = nullptr;
+ rtx_insn *tls_insn = ix86_emit_tls_call (tls, kind, bb, &before,
+ &after,
+ updated_gnu_tls_insns,
+ updated_gnu2_tls_insns);
+
+ rtx_insn *tlsdesc_insn = nullptr;
+ if (tlsdesc_set)
+ {
+ rtx dest = copy_rtx (SET_DEST (tlsdesc_set));
+ rtx src = copy_rtx (SET_SRC (tlsdesc_set));
+ tlsdesc_set = gen_rtx_SET (dest, src);
+ tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn);
+ }
+
+ if (kind != X86_CSE_TLSDESC)
+ {
+ RTL_CONST_CALL_P (tls_insn) = 1;
+
+ /* Indicate that this function can't jump to non-local gotos. */
+ make_reg_eh_region_note_nothrow_nononlocal (tls_insn);
+ }
+
+ if (recog_memoized (tls_insn) < 0)
+ gcc_unreachable ();
+
+ if (dump_file)
+ {
+ if (after)
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ if (tlsdesc_insn)
+ print_rtl_single (dump_file, tlsdesc_insn);
+ print_rtl_single (dump_file, tls_insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, after);
+ fprintf (dump_file, "\n");
+ }
+ else
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ if (tlsdesc_insn)
+ print_rtl_single (dump_file, tlsdesc_insn);
+ print_rtl_single (dump_file, tls_insn);
+ fprintf (dump_file, "\nbefore:\n\n");
+ print_rtl_single (dump_file, before);
+ fprintf (dump_file, "\n");
+ }
+ }
+
+ if (kind != X86_CSE_TLSDESC)
+ {
+ /* Copy RAX to DEST. */
+ set = gen_rtx_SET (dest, rax);
+ rtx_insn *set_insn = emit_insn_after (set, tls_insn);
+ set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest);
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ print_rtl_single (dump_file, set_insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, tls_insn);
+ fprintf (dump_file, "\n");
+ }
+ }
+}
+
+namespace {
+
+const pass_data pass_data_x86_cse =
+{
+ RTL_PASS, /* type */
+ "x86_cse", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_MACH_DEP, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_x86_cse : public rtl_opt_pass
+{
+public:
+ pass_x86_cse (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_x86_cse, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ bool gate (function *fun) final override
+ {
+ return (TARGET_SSE2
+ && optimize
+ && optimize_function_for_speed_p (fun));
+ }
+
+ unsigned int execute (function *) final override
+ {
+ return x86_cse ();
+ }
+
+private:
+ /* The redundant source value. */
+ rtx val;
+ /* The actual redundant source value for UNSPEC_TLSDESC. */
+ rtx tlsdesc_val;
+ /* The instruction which defines the redundant value. */
+ rtx_insn *def_insn;
+ /* Mode of the destination of the candidate redundant instruction. */
+ machine_mode mode;
+ /* Mode of the source of the candidate redundant instruction. */
+ machine_mode scalar_mode;
+ /* The classification of the candidate redundant instruction. */
+ x86_cse_kind kind;
+
+ unsigned int x86_cse (void);
+ bool candidate_gnu_tls_p (rtx_insn *, attr_tls64);
+ bool candidate_gnu2_tls_p (rtx, attr_tls64);
+ bool candidate_vector_p (rtx);
+ rtx_insn *tls_set_insn_from_symbol (const_rtx, const_rtx);
+}; // class pass_x86_cse
+
+/* Return the instruction which sets REG from TLS_SYMBOL. */
+
+rtx_insn *
+pass_x86_cse::tls_set_insn_from_symbol (const_rtx reg,
+ const_rtx tls_symbol)
+{
+ rtx_insn *set_insn = nullptr;
+ for (df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
+ ref;
+ ref = DF_REF_NEXT_REG (ref))
+ {
+ if (DF_REF_IS_ARTIFICIAL (ref))
+ return nullptr;
+
+ set_insn = DF_REF_INSN (ref);
+ if (get_attr_tls64 (set_insn) != TLS64_LEA)
+ return nullptr;
+
+ rtx tls_set = PATTERN (set_insn);
+ rtx tls_src = XVECEXP (SET_SRC (tls_set), 0, 0);
+ if (!rtx_equal_p (tls_symbol, tls_src))
+ return nullptr;
+ }
+
+ return set_insn;
+}
+
+/* Return true and output def_insn, val, mode, scalar_mode and kind if
+ INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. */
+
+bool
+pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn, attr_tls64 tls64)
+{
+ if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
+ return false;
+
+ /* Record the redundant TLS CALLs for 64-bit:
+
+ (parallel [
+ (set (reg:DI 0 ax)
+ (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
+ (const_int 0 [0])))
+ (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
+ (reg/f:DI 7 sp)] UNSPEC_TLS_GD)
+ (clobber (reg:DI 5 di))])
+
+
+ and
+
+ (parallel [
+ (set (reg:DI 0 ax)
+ (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
+ (const_int 0 [0])))
+ (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)])
+
+ */
+
+ rtx pat = PATTERN (insn);
+ rtx set = XVECEXP (pat, 0, 0);
+ gcc_assert (GET_CODE (set) == SET);
+ rtx dest = SET_DEST (set);
+ scalar_mode = mode = GET_MODE (dest);
+ val = XVECEXP (pat, 0, 1);
+ gcc_assert (GET_CODE (val) == UNSPEC);
+
+ if (tls64 == TLS64_GD)
+ kind = X86_CSE_TLS_GD;
+ else
+ kind = X86_CSE_TLS_LD_BASE;
+
+ def_insn = nullptr;
+ return true;
+}
+
+/* Return true and output def_insn, val, mode, scalar_mode and kind if
+ SET is UNSPEC_TLSDESC. */
+
+bool
+pass_x86_cse::candidate_gnu2_tls_p (rtx set, attr_tls64 tls64)
+{
+ if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
+ return false;
+
+ rtx tls_symbol;
+ rtx_insn *set_insn;
+ rtx src = SET_SRC (set);
+ val = src;
+ tlsdesc_val = src;
+ kind = X86_CSE_TLSDESC;
+
+ if (tls64 == TLS64_COMBINE)
+ {
+ /* Record 64-bit TLS64_COMBINE:
+
+ (set (reg/f:DI 104)
+ (plus:DI (unspec:DI [
+ (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
+ (reg:DI 114)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
+ (const:DI (unspec:DI [
+ (symbol_ref:DI ("e") [flags 0x1a])
+ ] UNSPEC_DTPOFF))))
+
+ (set (reg/f:DI 104)
+ (plus:DI (unspec:DI [
+ (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
+ (unspec:DI [
+ (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
+ ] UNSPEC_TLSDESC)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
+ (const:DI (unspec:DI [
+ (symbol_ref:DI ("e") [flags 0x1a])
+ ] UNSPEC_DTPOFF))))
+ */
+
+ scalar_mode = mode = GET_MODE (src);
+
+ /* Since the first operand of PLUS in the source TLS_COMBINE
+ pattern is unused, use the second operand of PLUS:
+
+ (const:DI (unspec:DI [
+ (symbol_ref:DI ("e") [flags 0x1a])
+ ] UNSPEC_DTPOFF))
+
+ as VAL to check if 2 TLS_COMBINE patterns have the same
+ source. */
+ val = XEXP (src, 1);
+ gcc_assert (GET_CODE (val) == CONST
+ && GET_CODE (XEXP (val, 0)) == UNSPEC
+ && XINT (XEXP (val, 0), 1) == UNSPEC_DTPOFF
+ && SYMBOL_REF_P (XVECEXP (XEXP (val, 0), 0, 0)));
+ def_insn = nullptr;
+ return true;
+ }
+
+ /* Record 64-bit TLS_CALL:
+
+ (set (reg:DI 101)
+ (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
+ (reg:DI 112)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
+
+ */
+
+ gcc_assert (GET_CODE (src) == UNSPEC);
+ tls_symbol = XVECEXP (src, 0, 0);
+ src = XVECEXP (src, 0, 1);
+ scalar_mode = mode = GET_MODE (src);
+ gcc_assert (REG_P (src));
+
+ /* All definitions of reg:DI 129 in
+
+ (set (reg:DI 110)
+ (unspec:DI [(symbol_ref:DI ("foo"))
+ (reg:DI 129)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
+
+ should have the same source as in
+
+ (set (reg:DI 129)
+ (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC))
+
+ */
+
+ set_insn = tls_set_insn_from_symbol (src, tls_symbol);
+ if (!set_insn)
+ return false;
+
+ /* Use TLS_SYMBOL as VAL to check if 2 patterns have the same source. */
+ val = tls_symbol;
+ def_insn = set_insn;
+ return true;
+}
+
+/* Return true and output def_insn, val, mode, scalar_mode and kind if
+   SET is a vector broadcast pattern.  */
+
+bool
+pass_x86_cse::candidate_vector_p (rtx set)
+{
+ rtx src = SET_SRC (set);
+ rtx dest = SET_DEST (set);
+ mode = GET_MODE (dest);
+ /* Skip non-vector instruction. */
+ if (!VECTOR_MODE_P (mode))
+ return false;
+
+ /* Skip non-vector load instruction. */
+ if (!REG_P (dest) && !SUBREG_P (dest))
+ return false;
+
+ val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind,
+ &def_insn);
+ return val ? true : false;
+}
+
+/* At entry of the nearest common dominator for basic blocks with
+
+ 1. Vector CONST0_RTX patterns.
+ 2. Vector CONSTM1_RTX patterns.
+ 3. Vector broadcast patterns.
+ 4. UNSPEC_TLS_GD patterns.
+ 5. UNSPEC_TLS_LD_BASE patterns.
+ 6. UNSPEC_TLSDESC patterns.
+
+ generate a single pattern whose destination is used to replace the
+ source in all identical patterns.
+
+ NB: We want to generate a pattern, which is executed only once, to
+ cover the whole function. The LCM algorithm isn't appropriate here
+ since it may place a pattern inside the loop. */
+
+unsigned int
+pass_x86_cse::x86_cse (void)
{
timevar_push (TV_MACH_DEP);
- auto_vec<redundant_load *> loads;
- redundant_load *load;
+ auto_vec<redundant_pattern *> loads;
+ redundant_pattern *load;
basic_block bb;
rtx_insn *insn;
unsigned int i;
+ auto_bitmap updated_gnu_tls_insns;
+ auto_bitmap updated_gnu2_tls_insns;
df_set_flags (DF_DEFER_INSN_RESCAN);
@@ -3700,61 +4411,74 @@ remove_redundant_vector_load (void)
if (!NONDEBUG_INSN_P (insn))
continue;
+ bool matched = false;
+	/* Remove redundant patterns if there are 2 or more of
+	   them.  */
+ unsigned int threshold = 2;
+
rtx set = single_set (insn);
- if (!set)
+ if (!set && !CALL_P (insn))
continue;
- /* Record single set vector instruction with CONST0_RTX and
- CONSTM1_RTX source. Record basic blocks with CONST0_RTX and
- CONSTM1_RTX. Count CONST0_RTX and CONSTM1_RTX. Record the
- maximum size of CONST0_RTX and CONSTM1_RTX. */
+ tlsdesc_val = nullptr;
- rtx dest = SET_DEST (set);
- machine_mode mode = GET_MODE (dest);
- /* Skip non-vector instruction. */
- if (!VECTOR_MODE_P (mode))
- continue;
+ attr_tls64 tls64 = get_attr_tls64 (insn);
+ switch (tls64)
+ {
+ case TLS64_GD:
+ case TLS64_LD_BASE:
+ /* Verify UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE. */
+ if (candidate_gnu_tls_p (insn, tls64))
+ break;
+ continue;
- rtx src = SET_SRC (set);
- /* Skip non-vector load instruction. */
- if (!REG_P (dest) && !SUBREG_P (dest))
- continue;
+ case TLS64_CALL:
+ case TLS64_COMBINE:
+ /* Verify UNSPEC_TLSDESC. */
+ if (candidate_gnu2_tls_p (set, tls64))
+ break;
+ continue;
- rtx_insn *def_insn;
- machine_mode scalar_mode;
- x86_cse_kind kind;
- rtx val = ix86_broadcast_inner (src, mode, &scalar_mode,
- &kind, &def_insn);
- if (!val)
- continue;
+ case TLS64_LEA:
+ /* Skip TLS64_LEA. */
+ continue;
- /* Remove redundant register loads if there are more than 2
- loads will be used. */
- unsigned int threshold = 2;
+ case TLS64_NONE:
+ if (!set)
+ continue;
- /* Check if there is a matching redundant vector load. */
- bool matched = false;
+ /* Check for vector broadcast. */
+ if (candidate_vector_p (set))
+ break;
+ continue;
+ }
+
+ /* Check if there is a matching redundant load. */
FOR_EACH_VEC_ELT (loads, i, load)
if (load->val
&& load->kind == kind
&& load->mode == scalar_mode
&& (load->bb == bb
- || kind < X86_CSE_VEC_DUP
+ || kind != X86_CSE_VEC_DUP
/* Non all 0s/1s vector load must be in the same
basic block if it is in a recursive call. */
|| !recursive_call_p)
&& rtx_equal_p (load->val, val))
{
- /* Record vector instruction. */
+ /* Record instruction. */
bitmap_set_bit (load->insns, INSN_UID (insn));
/* Record the maximum vector size. */
- if (load->size < GET_MODE_SIZE (mode))
+ if (kind <= X86_CSE_VEC_DUP
+ && load->size < GET_MODE_SIZE (mode))
load->size = GET_MODE_SIZE (mode);
/* Record the basic block. */
bitmap_set_bit (load->bbs, bb->index);
+
+ /* Increment the count. */
load->count++;
+
matched = true;
break;
}
@@ -3762,10 +4486,17 @@ remove_redundant_vector_load (void)
if (matched)
continue;
- /* We see this vector broadcast the first time. */
- load = new redundant_load;
+      /* We see this instruction for the first time.  Record the
+	 redundant source value, its mode, the destination size, the
+	 instruction which defines the redundant source value, the
+	 instruction's basic block and the instruction kind.  */
+ load = new redundant_pattern;
load->val = copy_rtx (val);
+ if (tlsdesc_val)
+ load->tlsdesc_val = copy_rtx (tlsdesc_val);
+ else
+ load->tlsdesc_val = nullptr;
load->mode = scalar_mode;
load->size = GET_MODE_SIZE (mode);
load->def_insn = def_insn;
@@ -3782,49 +4513,64 @@ remove_redundant_vector_load (void)
}
bool replaced = false;
- rtx reg, broadcast_source, broadcast_reg;
FOR_EACH_VEC_ELT (loads, i, load)
if (load->count >= load->threshold)
{
- machine_mode mode = ix86_get_vector_cse_mode (load->size,
- load->mode);
- broadcast_reg = gen_reg_rtx (mode);
- if (load->def_insn)
- {
- /* Replace redundant vector loads with a single vector load
- in the same basic block. */
- reg = load->val;
- if (load->mode != GET_MODE (reg))
- reg = gen_rtx_SUBREG (load->mode, reg, 0);
- broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
- replace_vector_const (mode, broadcast_reg, load->insns,
- load->mode);
- }
- else
+ machine_mode mode;
+ rtx reg, broadcast_source, broadcast_reg;
+ replaced = true;
+ switch (load->kind)
{
- /* This is a constant integer/double vector. If the
- inner scalar is 0 or -1, set vector to CONST0_RTX
- or CONSTM1_RTX directly. */
- rtx reg;
- switch (load->kind)
+ case X86_CSE_TLS_GD:
+ case X86_CSE_TLS_LD_BASE:
+ case X86_CSE_TLSDESC:
+ broadcast_reg = gen_reg_rtx (load->mode);
+ replace_tls_call (broadcast_reg, load->insns,
+ (load->kind == X86_CSE_TLSDESC
+ ? updated_gnu2_tls_insns
+ : updated_gnu_tls_insns));
+ load->broadcast_reg = broadcast_reg;
+ break;
+
+ case X86_CSE_CONST0_VECTOR:
+ case X86_CSE_CONSTM1_VECTOR:
+ case X86_CSE_VEC_DUP:
+ mode = ix86_get_vector_cse_mode (load->size, load->mode);
+ broadcast_reg = gen_reg_rtx (mode);
+ if (load->def_insn)
{
- case X86_CSE_CONST0_VECTOR:
- broadcast_source = CONST0_RTX (mode);
- break;
- case X86_CSE_CONSTM1_VECTOR:
- broadcast_source = CONSTM1_RTX (mode);
- break;
- default:
- reg = gen_reg_rtx (load->mode);
+ /* Replace redundant vector loads with a single vector
+ load in the same basic block. */
+ reg = load->val;
+ if (load->mode != GET_MODE (reg))
+ reg = gen_rtx_SUBREG (load->mode, reg, 0);
broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
- break;
}
+ else
+ /* This is a constant integer/double vector. If the
+ inner scalar is 0 or -1, set vector to CONST0_RTX
+ or CONSTM1_RTX directly. */
+ switch (load->kind)
+ {
+ case X86_CSE_CONST0_VECTOR:
+ broadcast_source = CONST0_RTX (mode);
+ break;
+ case X86_CSE_CONSTM1_VECTOR:
+ broadcast_source = CONSTM1_RTX (mode);
+ break;
+ case X86_CSE_VEC_DUP:
+ reg = gen_reg_rtx (load->mode);
+ broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
+ break;
+ default:
+ gcc_unreachable ();
+ }
replace_vector_const (mode, broadcast_reg, load->insns,
load->mode);
+ load->broadcast_source = broadcast_source;
+ load->broadcast_reg = broadcast_reg;
+ break;
}
- load->broadcast_source = broadcast_source;
- load->broadcast_reg = broadcast_reg;
- replaced = true;
}
if (replaced)
@@ -3839,43 +4585,75 @@ remove_redundant_vector_load (void)
FOR_EACH_VEC_ELT (loads, i, load)
if (load->count >= load->threshold)
{
+ rtx set;
if (load->def_insn)
- {
- /* Insert a broadcast after the original scalar
- definition. */
- rtx set = gen_rtx_SET (load->broadcast_reg,
- load->broadcast_source);
- insn = emit_insn_after (set, load->def_insn);
-
- if (cfun->can_throw_non_call_exceptions)
- {
- /* Handle REG_EH_REGION note in DEF_INSN. */
- rtx note = find_reg_note (load->def_insn,
- REG_EH_REGION, nullptr);
- if (note)
- {
- control_flow_insns.safe_push (load->def_insn);
- add_reg_note (insn, REG_EH_REGION,
- XEXP (note, 0));
- }
- }
+ switch (load->kind)
+ {
+ case X86_CSE_TLSDESC:
+ ix86_place_single_tls_call (load->broadcast_reg,
+ load->tlsdesc_val,
+ load->kind,
+ load->bbs,
+ updated_gnu_tls_insns,
+ updated_gnu2_tls_insns,
+ PATTERN (load->def_insn));
+ break;
+ case X86_CSE_VEC_DUP:
+ /* Insert a broadcast after the original scalar
+ definition. */
+ set = gen_rtx_SET (load->broadcast_reg,
+ load->broadcast_source);
+ insn = emit_insn_after (set, load->def_insn);
+
+ if (cfun->can_throw_non_call_exceptions)
+ {
+ /* Handle REG_EH_REGION note in DEF_INSN. */
+ rtx note = find_reg_note (load->def_insn,
+ REG_EH_REGION, nullptr);
+ if (note)
+ {
+ control_flow_insns.safe_push (load->def_insn);
+ add_reg_note (insn, REG_EH_REGION,
+ XEXP (note, 0));
+ }
+ }
- if (dump_file)
- {
- fprintf (dump_file, "\nAdd:\n\n");
- print_rtl_single (dump_file, insn);
- fprintf (dump_file, "\nafter:\n\n");
- print_rtl_single (dump_file, load->def_insn);
- fprintf (dump_file, "\n");
- }
- }
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nAdd:\n\n");
+ print_rtl_single (dump_file, insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, load->def_insn);
+ fprintf (dump_file, "\n");
+ }
+ break;
+ default:
+ gcc_unreachable ();
+ }
else
- ix86_place_single_vector_set (load->broadcast_reg,
- load->broadcast_source,
- load->bbs,
- (load->kind == X86_CSE_VEC_DUP
- ? load->val
- : nullptr));
+ switch (load->kind)
+ {
+ case X86_CSE_TLS_GD:
+ case X86_CSE_TLS_LD_BASE:
+ case X86_CSE_TLSDESC:
+ ix86_place_single_tls_call (load->broadcast_reg,
+ (load->kind == X86_CSE_TLSDESC
+ ? load->tlsdesc_val
+ : load->val),
+ load->kind,
+ load->bbs,
+ updated_gnu_tls_insns,
+ updated_gnu2_tls_insns);
+ break;
+ case X86_CSE_CONST0_VECTOR:
+ case X86_CSE_CONSTM1_VECTOR:
+ case X86_CSE_VEC_DUP:
+ ix86_place_single_vector_set (load->broadcast_reg,
+ load->broadcast_source,
+ load->bbs,
+ load);
+ break;
+ }
}
loop_optimizer_finalize ();
@@ -3905,48 +4683,12 @@ remove_redundant_vector_load (void)
return 0;
}
-namespace {
-
-const pass_data pass_data_remove_redundant_vector_load =
-{
- RTL_PASS, /* type */
- "rrvl", /* name */
- OPTGROUP_NONE, /* optinfo_flags */
- TV_MACH_DEP, /* tv_id */
- 0, /* properties_required */
- 0, /* properties_provided */
- 0, /* properties_destroyed */
- 0, /* todo_flags_start */
- 0, /* todo_flags_finish */
-};
-
-class pass_remove_redundant_vector_load : public rtl_opt_pass
-{
-public:
- pass_remove_redundant_vector_load (gcc::context *ctxt)
- : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt)
- {}
-
- /* opt_pass methods: */
- bool gate (function *fun) final override
- {
- return (TARGET_SSE2
- && optimize
- && optimize_function_for_speed_p (fun));
- }
-
- unsigned int execute (function *) final override
- {
- return remove_redundant_vector_load ();
- }
-}; // class pass_remove_redundant_vector_load
-
} // anon namespace
rtl_opt_pass *
-make_pass_remove_redundant_vector_load (gcc::context *ctxt)
+make_pass_x86_cse (gcc::context *ctxt)
{
- return new pass_remove_redundant_vector_load (ctxt);
+ return new pass_x86_cse (ctxt);
}
/* Convert legacy instructions that clobber EFLAGS to APX_NF
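Reviewer note: to see what the TLS side of the renamed x86_cse pass catches, consider general-dynamic accesses in separate basic blocks. Each access expands to its own __tls_get_addr call, and the pass now keeps a single call at the nearest common dominator. A hedged example for -O2 -fPIC -mtls-dialect=gnu on x86-64; exact codegen depends on flags:

extern __thread int counter;

int
bump (int flag)
{
  if (flag)
    return ++counter;	/* was: its own __tls_get_addr call */
  return --counter;	/* was: a second, redundant call */
}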
diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
index 2fedbeb..c2db305 100644
--- a/gcc/config/i386/i386-modes.def
+++ b/gcc/config/i386/i386-modes.def
@@ -91,7 +91,6 @@ VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */
VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */
VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */
VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */
-VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */
VECTOR_MODE (FLOAT, HF, 2); /* V2HF */
VECTOR_MODE (FLOAT, BF, 2); /* V2BF */
VECTOR_MODE (FLOAT, HF, 6); /* V6HF */
@@ -102,7 +101,6 @@ VECTOR_MODE (INT, QI, 2); /* V2QI */
VECTOR_MODE (INT, QI, 12); /* V12QI */
VECTOR_MODE (INT, QI, 14); /* V14QI */
VECTOR_MODE (INT, HI, 6); /* V6HI */
-VECTOR_MODE (INT, SI, 64); /* V64SI */
INT_MODE (OI, 32);
INT_MODE (XI, 64);
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index ca6bb83..abb5dd7 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1172,6 +1172,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
OPT_mrecip,
MASK_RECIP),
+ IX86_ATTR_YES ("80387",
+ OPT_m80387,
+ MASK_80387),
+
IX86_ATTR_IX86_YES ("general-regs-only",
OPT_mgeneral_regs_only,
OPTION_MASK_GENERAL_REGS_ONLY),
@@ -1281,6 +1285,8 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
else if (type == ix86_opt_yes || type == ix86_opt_no)
{
+ opts_set->x_target_flags |= mask;
+
if (type == ix86_opt_no)
opt_set_p = !opt_set_p;
@@ -3556,6 +3562,10 @@ ix86_set_current_function (tree fndecl)
isa = "AVX";
else if (cfun->machine->func_type != TYPE_NORMAL)
isa = "SSE";
+ else if (TARGET_MMX)
+ isa = "MMX/3Dnow";
+ else if (TARGET_80387)
+ isa = "80387";
else
isa = NULL;
}
@@ -3615,6 +3625,18 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
return NULL_TREE;
}
+ if (TARGET_64BIT)
+ {
+ /* Do not warn when emulating the MS ABI. */
+ if ((TREE_CODE (*node) != FUNCTION_TYPE
+ && TREE_CODE (*node) != METHOD_TYPE)
+ || ix86_function_type_abi (*node) != MS_ABI)
+ warning (OPT_Wattributes, "%qE attribute ignored",
+ name);
+ *no_add_attrs = true;
+ return NULL_TREE;
+ }
+
/* Can combine regparm with all attributes but fastcall, and thiscall. */
if (is_attribute_p ("regparm", name))
{
@@ -3627,7 +3649,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
{
- error ("regparam and thiscall attributes are not compatible");
+ error ("regparm and thiscall attributes are not compatible");
}
cst = TREE_VALUE (args);
@@ -3648,19 +3670,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
return NULL_TREE;
}
- if (TARGET_64BIT)
- {
- /* Do not warn when emulating the MS ABI. */
- if ((TREE_CODE (*node) != FUNCTION_TYPE
- && TREE_CODE (*node) != METHOD_TYPE)
- || ix86_function_type_abi (*node) != MS_ABI)
- warning (OPT_Wattributes, "%qE attribute ignored",
- name);
- *no_add_attrs = true;
- return NULL_TREE;
- }
-
- /* Can combine fastcall with stdcall (redundant) and sseregparm. */
+ /* Can combine fastcall with sseregparm. */
if (is_attribute_p ("fastcall", name))
{
if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
@@ -3681,8 +3691,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
}
}
- /* Can combine stdcall with fastcall (redundant), regparm and
- sseregparm. */
+ /* Can combine stdcall with regparm and sseregparm. */
else if (is_attribute_p ("stdcall", name))
{
if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
@@ -3732,6 +3741,10 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
{
error ("cdecl and thiscall attributes are not compatible");
}
+ if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
+ {
+ error ("regparm and thiscall attributes are not compatible");
+ }
}
/* Can combine sseregparm with all attributes. */
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 06f0288..553b46d 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -35,6 +35,6 @@ along with GCC; see the file COPYING3. If not see
PR116174. */
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops);
- INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_redundant_vector_load);
+ INSERT_PASS_AFTER (pass_late_combine, 1, pass_x86_cse);
INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency);
INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 69bc0ee..bdb8bb9 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -290,6 +290,7 @@ extern rtx ix86_tls_module_base (void);
extern bool ix86_gpr_tls_address_pattern_p (rtx);
extern bool ix86_tls_address_pattern_p (rtx);
extern rtx ix86_rewrite_tls_address (rtx);
+extern rtx ix86_tls_get_addr (void);
extern void ix86_expand_vector_init (bool, rtx, rtx);
extern void ix86_expand_vector_set (bool, rtx, rtx, int);
@@ -430,8 +431,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
(gcc::context *);
extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
(gcc::context *);
-extern rtl_opt_pass *make_pass_remove_redundant_vector_load
- (gcc::context *);
+extern rtl_opt_pass *make_pass_x86_cse (gcc::context *);
extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *);
extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
@@ -448,3 +448,4 @@ extern void ix86_set_handled_components (sbitmap);
/* In i386-expand.cc. */
bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*,
HOST_WIDE_INT*);
+rtx ix86_vgf2p8affine_shift_matrix (rtx, rtx, enum rtx_code);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 4682db85..471be3e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -11382,6 +11382,23 @@ ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
return cost;
}
+
+/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P. */
+
+bool
+ix86_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
+ unsigned int align,
+ enum by_pieces_operation op,
+ bool speed_p)
+{
+ /* Return true when we are currently expanding memcpy/memset epilogue
+ with move_by_pieces or store_by_pieces. */
+ if (cfun->machine->by_pieces_in_use)
+ return true;
+
+ return default_use_by_pieces_infrastructure_p (size, align, op,
+ speed_p);
+}
/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
this is used to form addresses to local data when -fPIC is in
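Reviewer note: a hedged illustration of the new hook above. expand_cpymem_epilogue and expand_setmem_epilogue set by_pieces_in_use around their move_by_pieces/store_by_pieces calls, so for a copy like the one below the tail bytes left after the main loop are expanded by-pieces even where the default size heuristic would decline:

#include <string.h>

void
copy_tail (char *dst, const char *src)
{
  /* Assumed scenario: the vector loop copies the bulk, and the
     remaining countval % max_size bytes go through move_by_pieces
     while cfun->machine->by_pieces_in_use is true.  */
  memcpy (dst, src, 61);
}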
@@ -12439,9 +12456,31 @@ ix86_tls_index (void)
static GTY(()) rtx ix86_tls_symbol;
-static rtx
+rtx
ix86_tls_get_addr (void)
{
+ if (cfun->machine->call_saved_registers
+ == TYPE_NO_CALLER_SAVED_REGISTERS)
+ {
+ /* __tls_get_addr doesn't preserve vector registers. When a
+ function with no_caller_saved_registers attribute calls
+ __tls_get_addr, YMM and ZMM registers will be clobbered.
+ Issue an error and suggest -mtls-dialect=gnu2 in this case. */
+ if (cfun->machine->func_type == TYPE_NORMAL)
+ error (G_("%<-mtls-dialect=gnu2%> must be used with a function"
+ " with the %<no_caller_saved_registers%> attribute"));
+ else
+ error (cfun->machine->func_type == TYPE_EXCEPTION
+ ? G_("%<-mtls-dialect=gnu2%> must be used with an"
+ " exception service routine")
+ : G_("%<-mtls-dialect=gnu2%> must be used with an"
+ " interrupt service routine"));
+ /* Don't issue the same error twice. */
+ cfun->machine->func_type = TYPE_NORMAL;
+ cfun->machine->call_saved_registers
+ = TYPE_DEFAULT_CALL_SAVED_REGISTERS;
+ }
+
if (!ix86_tls_symbol)
{
const char *sym
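Reviewer note: the new diagnostic fires when a function that must preserve all registers reaches __tls_get_addr, which does not preserve vector registers. A hypothetical test case; compiled with -O2 -fPIC -mtls-dialect=gnu this is now rejected with a suggestion to use -mtls-dialect=gnu2:

extern __thread int tls_var;

__attribute__ ((no_caller_saved_registers))
int
read_tls (void)
{
  return tls_var;	/* error: -mtls-dialect=gnu2 required here */
}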
@@ -20007,7 +20046,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
tree utype, ures, vce;
utype = unsigned_type_for (TREE_TYPE (arg0));
/* PABSB/W/D/Q store the unsigned result in dst, use ABSU_EXPR
- instead of ABS_EXPR to hanlde overflow case(TYPE_MIN). */
+ instead of ABS_EXPR to handle overflow case(TYPE_MIN). */
ures = gimple_build (&stmts, ABSU_EXPR, utype, arg0);
gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
loc = gimple_location (stmt);
@@ -21491,8 +21530,7 @@ ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
/* Register pair for mask registers. */
if (mode == P2QImode || mode == P2HImode)
return 2;
- if (mode == V64SFmode || mode == V64SImode)
- return 4;
+
return 1;
}
@@ -22081,6 +22119,15 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
}
/* FALLTHRU */
case V32QImode:
+ if (TARGET_GFNI && constant_op1)
+ {
+	  /* Use vgf2p8affine.  One extra load for the mask, but in a loop
+	     with enough registers it will be moved out.  So for now don't
+	     account for the constant mask load.  This is not quite right
+	     for non-loop vectorization.  */
+ extra = 0;
+ return ix86_vec_cost (mode, cost->sse_op) + extra;
+ }
if (TARGET_AVX2)
/* Use vpbroadcast. */
extra = cost->sse_op;
@@ -22115,6 +22162,11 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
count = 9;
return ix86_vec_cost (mode, cost->sse_op * count) + extra;
+ case V64QImode:
+ /* Ignore the mask load for GF2P8AFFINEQB. */
+ extra = 0;
+ return ix86_vec_cost (mode, cost->sse_op) + extra;
+
case V2DImode:
case V4DImode:
/* V*DImode arithmetic right shift is emulated. */
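Reviewer note: the GFNI rows above model a constant per-byte shift as one vgf2p8affineqb plus a hoistable constant matrix load. A hedged example of a loop where this cost applies (exact codegen depends on the enabled ISA):

void
srl3 (unsigned char *restrict dst, const unsigned char *restrict src)
{
  /* With e.g. -O2 -mgfni -mavx2 this can vectorize to a V32QImode
     vgf2p8affineqb using the LSHIFTRT matrix for count 3, instead of
     the multi-instruction emulation of QImode shifts.  */
  for (int i = 0; i < 1024; i++)
    dst[i] = src[i] >> 3;
}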
@@ -23132,7 +23184,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
So current solution is make constant disp as cheap as possible. */
if (GET_CODE (addr) == PLUS
&& x86_64_immediate_operand (XEXP (addr, 1), Pmode)
- /* Only hanlde (reg + disp) since other forms of addr are mostly LEA,
+ /* Only handle (reg + disp) since other forms of addr are mostly LEA,
there's no additional cost for the plus of disp. */
&& register_operand (XEXP (addr, 0), Pmode))
{
@@ -25211,20 +25263,14 @@ asm_preferred_eh_data_format (int code, int global)
return DW_EH_PE_absptr;
}
-/* Implement targetm.vectorize.builtin_vectorization_cost. */
+/* Worker for ix86_builtin_vectorization_cost and the fallback calls
+ from ix86_vector_costs::add_stmt_cost. */
static int
-ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype, int)
+ix86_default_vector_cost (enum vect_cost_for_stmt type_of_cost,
+ machine_mode mode)
{
- bool fp = false;
- machine_mode mode = TImode;
+ bool fp = FLOAT_MODE_P (mode);
int index;
- if (vectype != NULL)
- {
- fp = FLOAT_TYPE_P (vectype);
- mode = TYPE_MODE (vectype);
- }
-
switch (type_of_cost)
{
case scalar_stmt:
@@ -25283,14 +25329,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
COSTS_N_INSNS
(ix86_cost->gather_static
+ ix86_cost->gather_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ * GET_MODE_NUNITS (mode)) / 2);
case vector_scatter_store:
return ix86_vec_cost (mode,
COSTS_N_INSNS
(ix86_cost->scatter_static
+ ix86_cost->scatter_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ * GET_MODE_NUNITS (mode)) / 2);
case cond_branch_taken:
return ix86_cost->cond_taken_branch_cost;
@@ -25308,7 +25354,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
case vec_construct:
{
- int n = TYPE_VECTOR_SUBPARTS (vectype);
+ int n = GET_MODE_NUNITS (mode);
/* N - 1 element inserts into an SSE vector, the possible
GPR -> XMM move is accounted for in add_stmt_cost. */
if (GET_MODE_BITSIZE (mode) <= 128)
@@ -25336,6 +25382,17 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
}
}
+/* Implement targetm.vectorize.builtin_vectorization_cost. */
+static int
+ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+ tree vectype, int)
+{
+ machine_mode mode = TImode;
+ if (vectype != NULL)
+ mode = TYPE_MODE (vectype);
+ return ix86_default_vector_cost (type_of_cost, mode);
+}
+
/* This function returns the calling abi specific va_list type node.
It returns the FNDECL specific va_list type. */
@@ -25768,15 +25825,20 @@ private:
unsigned m_num_sse_needed[3];
/* Number of 256-bit vector permutation. */
unsigned m_num_avx256_vec_perm[3];
+  /* Number of reductions for FMA/DOT_PROD_EXPR/SAD_EXPR.  */
+ unsigned m_num_reduc[X86_REDUC_LAST];
+ /* Don't do unroll if m_prefer_unroll is false, default is true. */
+ bool m_prefer_unroll;
};
ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar)
: vector_costs (vinfo, costing_for_scalar),
m_num_gpr_needed (),
m_num_sse_needed (),
- m_num_avx256_vec_perm ()
-{
-}
+ m_num_avx256_vec_perm (),
+ m_num_reduc (),
+ m_prefer_unroll (true)
+{}
/* Implement targetm.vectorize.create_costs. */
@@ -25789,7 +25851,7 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
unsigned
ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, slp_tree node,
- tree vectype, int misalign,
+ tree vectype, int,
vect_cost_model_location where)
{
unsigned retval = 0;
@@ -26073,6 +26135,125 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
}
}
+ /* Record number of load/store/gather/scatter in vectorized body. */
+ if (where == vect_body && !m_costing_for_scalar)
+ {
+ switch (kind)
+ {
+ /* Emulated gather/scatter or any scalarization. */
+ case scalar_load:
+ case scalar_stmt:
+ case scalar_store:
+ case vector_gather_load:
+ case vector_scatter_store:
+ m_prefer_unroll = false;
+ break;
+
+ case vector_stmt:
+	    /* Count the number of reduction FMAs and "real" DOT_PROD_EXPRs;
+	       unrolling in the vectorizer will enable partial sums.  */
+ unroll in the vectorizer will enable partial sum. */
+ if (stmt_info
+ && vect_is_reduction (stmt_info)
+ && stmt_info->stmt)
+ {
+ /* Handle __builtin_fma. */
+ if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA)
+ {
+ m_num_reduc[X86_REDUC_FMA] += count;
+ break;
+ }
+
+ if (!is_gimple_assign (stmt_info->stmt))
+ break;
+
+ tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
+ machine_mode inner_mode = GET_MODE_INNER (mode);
+ tree rhs1, rhs2;
+ bool native_vnni_p = true;
+ gimple* def;
+ machine_mode mode_rhs;
+ switch (subcode)
+ {
+ case PLUS_EXPR:
+ case MINUS_EXPR:
+ if (!fp || !flag_associative_math
+ || flag_fp_contract_mode != FP_CONTRACT_FAST)
+ break;
+
+ /* FMA condition for different modes. */
+ if (((inner_mode == DFmode || inner_mode == SFmode)
+ && !TARGET_FMA && !TARGET_AVX512VL)
+ || (inner_mode == HFmode && !TARGET_AVX512FP16)
+ || (inner_mode == BFmode && !TARGET_AVX10_2))
+ break;
+
+ /* MULT_EXPR + PLUS_EXPR/MINUS_EXPR is transformed
+ to FMA/FNMA after vectorization. */
+ rhs1 = gimple_assign_rhs1 (stmt_info->stmt);
+ rhs2 = gimple_assign_rhs2 (stmt_info->stmt);
+ if (subcode == PLUS_EXPR
+ && TREE_CODE (rhs1) == SSA_NAME
+ && (def = SSA_NAME_DEF_STMT (rhs1), true)
+ && is_gimple_assign (def)
+ && gimple_assign_rhs_code (def) == MULT_EXPR)
+ m_num_reduc[X86_REDUC_FMA] += count;
+ else if (TREE_CODE (rhs2) == SSA_NAME
+ && (def = SSA_NAME_DEF_STMT (rhs2), true)
+ && is_gimple_assign (def)
+ && gimple_assign_rhs_code (def) == MULT_EXPR)
+ m_num_reduc[X86_REDUC_FMA] += count;
+ break;
+
+	      /* Vectorizer lane_reducing_op_p supports DOT_PROD_EXPR,
+		 WIDEN_SUM_EXPR and SAD_EXPR; the x86 backend only
+		 supports SAD_EXPR (usad{v16qi,v32qi,v64qi}) and
+		 DOT_PROD_EXPR.  */
+ case DOT_PROD_EXPR:
+ rhs1 = gimple_assign_rhs1 (stmt_info->stmt);
+ mode_rhs = TYPE_MODE (TREE_TYPE (rhs1));
+ if (mode_rhs == QImode)
+ {
+ rhs2 = gimple_assign_rhs2 (stmt_info->stmt);
+ signop signop1_p = TYPE_SIGN (TREE_TYPE (rhs1));
+ signop signop2_p = TYPE_SIGN (TREE_TYPE (rhs2));
+
+ /* vpdpbusd. */
+ if (signop1_p != signop2_p)
+ native_vnni_p
+ = (GET_MODE_SIZE (mode) == 64
+ ? TARGET_AVX512VNNI
+ : ((TARGET_AVX512VNNI && TARGET_AVX512VL)
+ || TARGET_AVXVNNI));
+ else
+ /* vpdpbssd. */
+ native_vnni_p
+ = (GET_MODE_SIZE (mode) == 64
+ ? TARGET_AVX10_2
+ : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2));
+ }
+ m_num_reduc[X86_REDUC_DOT_PROD] += count;
+
+	      /* Avoid unrolling and partial sums for emulated
+		 DOT_PROD_EXPR.  */
+ if (!native_vnni_p)
+ m_num_reduc[X86_REDUC_DOT_PROD] += 3 * count;
+ break;
+
+ case SAD_EXPR:
+ m_num_reduc[X86_REDUC_SAD] += count;
+ break;
+
+ default:
+ break;
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+ }
+
combined_fn cfn;
if ((kind == vector_stmt || kind == scalar_stmt)
&& stmt_info
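
For reference, a sketch of user loops that exercise the new counting above
(illustrative only; the function names and flags are this note's assumptions,
not part of the patch):

  /* With -O3 -mavx512vnni -ffast-math, the first reduction is contracted
     to FMA and counted as X86_REDUC_FMA; the second becomes a
     DOT_PROD_EXPR with mixed-sign QImode operands (vpdpbusd) and is
     counted as X86_REDUC_DOT_PROD.  */
  float fma_reduc (const float *a, const float *b, int n)
  {
    float s = 0.0f;
    for (int i = 0; i < n; i++)
      s += a[i] * b[i];
    return s;
  }

  int dot_reduc (const unsigned char *a, const signed char *b, int n)
  {
    int s = 0;
    for (int i = 0; i < n; i++)
      s += a[i] * b[i];
    return s;
  }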
@@ -26128,32 +26309,23 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
(AGU and load ports). Try to account for this by scaling the
construction cost by the number of elements involved. */
if ((kind == vec_construct || kind == vec_to_scalar)
- && ((stmt_info
- && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
- || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
- && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
- && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+ && ((node
+ && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
+ || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP
+ && SLP_TREE_LANES (node) == 1))
+ && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
+ (SLP_TREE_REPRESENTATIVE (node))))
!= INTEGER_CST))
- || (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)
- == VMAT_GATHER_SCATTER)))
- || (node
- && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
- || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP
- && SLP_TREE_LANES (node) == 1))
- && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
- (SLP_TREE_REPRESENTATIVE (node))))
- != INTEGER_CST))
- || (SLP_TREE_MEMORY_ACCESS_TYPE (node)
- == VMAT_GATHER_SCATTER)))))
- {
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ || mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))))))
+ {
+ stmt_cost = ix86_default_vector_cost (kind, mode);
stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
}
else if ((kind == vec_construct || kind == scalar_to_vec)
&& node
&& SLP_TREE_DEF_TYPE (node) == vect_external_def)
{
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ stmt_cost = ix86_default_vector_cost (kind, mode);
unsigned i;
tree op;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
@@ -26217,7 +26389,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
TREE_VISITED (op) = 0;
}
if (stmt_cost == -1)
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ stmt_cost = ix86_default_vector_cost (kind, mode);
if (kind == vec_perm && vectype
&& GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
@@ -26288,6 +26460,41 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
&& (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())
> ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
m_costs[vect_body] = INT_MAX;
+
+ bool any_reduc_p = false;
+ for (int i = 0; i != X86_REDUC_LAST; i++)
+ if (m_num_reduc[i])
+ {
+ any_reduc_p = true;
+ break;
+ }
+
+ if (any_reduc_p
+ /* Not much gain for loops with gathers and scatters. */
+ && m_prefer_unroll
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+ {
+ unsigned unroll_factor
+ = OPTION_SET_P (ix86_vect_unroll_limit)
+ ? ix86_vect_unroll_limit
+ : ix86_cost->vect_unroll_limit;
+
+ if (unroll_factor > 1)
+ {
+ for (int i = 0; i != X86_REDUC_LAST; i++)
+ {
+ if (m_num_reduc[i])
+ {
+ unsigned tmp = CEIL (ix86_cost->reduc_lat_mult_thr[i],
+ m_num_reduc[i]);
+ unroll_factor = MIN (unroll_factor, tmp);
+ }
+ }
+
+ m_suggested_unroll_factor = 1 << ceil_log2 (unroll_factor);
+ }
+ }
}
ix86_vect_estimate_reg_pressure ();
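
To make the unroll computation concrete, a worked example using the znver4
numbers added later in this patch ({8, 8, 6} with vect_unroll_limit 4): for a
loop body containing three FMA reduction statements,

  unroll_factor = MIN (4 /* vect_unroll_limit */,
                       CEIL (8 /* reduc_lat_mult_thr[X86_REDUC_FMA] */, 3))
                = 3
  m_suggested_unroll_factor = 1 << ceil_log2 (3) = 4

Note that the final power-of-two rounding is upward, so the suggestion can
exceed the smallest per-kind quotient.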
@@ -27171,9 +27378,9 @@ ix86_memtag_can_tag_addresses ()
return ix86_lam_type != lam_none && TARGET_LP64;
}
-/* Implement TARGET_MEMTAG_TAG_SIZE. */
+/* Implement TARGET_MEMTAG_TAG_BITSIZE. */
unsigned char
-ix86_memtag_tag_size ()
+ix86_memtag_tag_bitsize ()
{
return IX86_HWASAN_TAG_SIZE;
}
@@ -27744,6 +27951,10 @@ static const scoped_attribute_specs *const ix86_attribute_table[] =
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost
+#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
+#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
+ ix86_use_by_pieces_infrastructure_p
+
#undef TARGET_OVERLAP_OP_BY_PIECES_P
#define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true
@@ -28147,8 +28358,8 @@ ix86_libgcc_floating_mode_supported_p
#undef TARGET_MEMTAG_UNTAGGED_POINTER
#define TARGET_MEMTAG_UNTAGGED_POINTER ix86_memtag_untagged_pointer
-#undef TARGET_MEMTAG_TAG_SIZE
-#define TARGET_MEMTAG_TAG_SIZE ix86_memtag_tag_size
+#undef TARGET_MEMTAG_TAG_BITSIZE
+#define TARGET_MEMTAG_TAG_BITSIZE ix86_memtag_tag_bitsize
#undef TARGET_GEN_CCMP_FIRST
#define TARGET_GEN_CCMP_FIRST ix86_gen_ccmp_first
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 791f3b9..ac0ce68 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -102,6 +102,15 @@ struct stringop_algs
#define COSTS_N_BYTES(N) ((N) * 2)
#endif
+
+enum ix86_reduc_unroll_factor {
+ X86_REDUC_FMA,
+ X86_REDUC_DOT_PROD,
+ X86_REDUC_SAD,
+
+ X86_REDUC_LAST
+};
+
/* Define the specific costs for a given cpu. NB: hard_register is used
by TARGET_REGISTER_MOVE_COST and TARGET_MEMORY_MOVE_COST to compute
hard register move costs by register allocator. Relative costs of
@@ -225,6 +234,13 @@ struct processor_costs {
to number of instructions executed in
parallel. See also
ix86_reassociation_width. */
+ const unsigned reduc_lat_mult_thr[X86_REDUC_LAST];
+ /* Latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ const unsigned vect_unroll_limit; /* Limit how much the autovectorizer
+ may unroll a loop. */
struct stringop_algs *memcpy, *memset;
const int cond_taken_branch_cost; /* Cost of taken branch for vectorizer
cost model. */
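
A note on the units of reduc_lat_mult_thr (an inference from the numbers in
x86-tune-costs.h below, not something the patch states): each entry looks like
instruction latency multiplied by per-cycle throughput (a 4-cycle FMA issuing
on two pipes gives the 8 seen in the skylake and znver3/znver4 rows), so
dividing it by the number of reduction statements in the loop body estimates
how many parallel partial sums are needed to hide the latency chain.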
@@ -644,7 +660,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
{"cpu_64", "%{" OPT_ARCH64 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \
{"arch", "%{!march=*:-march=%(VALUE)}"}, \
{"arch_32", "%{" OPT_ARCH32 ":%{!march=*:-march=%(VALUE)}}"}, \
- {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"},
+ {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"}, \
+ {"tls", "%{!mtls-dialect=*:-mtls-dialect=%(VALUE)}"},
/* Specs for the compiler proper */
@@ -2477,9 +2494,9 @@ constexpr wide_int_bitmask PTA_DIAMONDRAPIDS = PTA_GRANITERAPIDS_D
| PTA_MOVRS | PTA_AMX_MOVRS | PTA_USER_MSR;
constexpr wide_int_bitmask PTA_BDVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
- | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3
- | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
- | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE;
+ | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_POPCNT | PTA_LZCNT
+ | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL
+ | PTA_AVX | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE;
constexpr wide_int_bitmask PTA_BDVER2 = PTA_BDVER1 | PTA_BMI | PTA_TBM
| PTA_F16C | PTA_FMA;
constexpr wide_int_bitmask PTA_BDVER3 = PTA_BDVER2 | PTA_XSAVEOPT
@@ -2487,13 +2504,13 @@ constexpr wide_int_bitmask PTA_BDVER3 = PTA_BDVER2 | PTA_XSAVEOPT
constexpr wide_int_bitmask PTA_BDVER4 = PTA_BDVER3 | PTA_AVX2 | PTA_BMI2
| PTA_RDRND | PTA_MOVBE | PTA_MWAITX;
-constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
- | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
- | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2 | PTA_BMI | PTA_BMI2
- | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT
- | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
- | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES | PTA_SHA | PTA_LZCNT
- | PTA_POPCNT;
+constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
+ | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_POPCNT | PTA_LZCNT
+ | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL
+ | PTA_AVX | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
+ | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE
+ | PTA_MWAITX | PTA_ADX | PTA_RDSEED | PTA_CLZERO | PTA_CLFLUSHOPT
+ | PTA_XSAVEC | PTA_XSAVES | PTA_SHA;
constexpr wide_int_bitmask PTA_ZNVER2 = PTA_ZNVER1 | PTA_CLWB | PTA_RDPID
| PTA_WBNOINVD;
constexpr wide_int_bitmask PTA_ZNVER3 = PTA_ZNVER2 | PTA_VAES | PTA_VPCLMULQDQ
@@ -2506,19 +2523,19 @@ constexpr wide_int_bitmask PTA_ZNVER5 = PTA_ZNVER4 | PTA_AVXVNNI
| PTA_MOVDIRI | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_PREFETCHI;
constexpr wide_int_bitmask PTA_BTVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
- | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16
- | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE;
+ | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_LZCNT | PTA_POPCNT
+ | PTA_ABM | PTA_CX16 | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE;
constexpr wide_int_bitmask PTA_BTVER2 = PTA_BTVER1 | PTA_SSE4_1 | PTA_SSE4_2
| PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_BMI | PTA_F16C | PTA_MOVBE
| PTA_XSAVEOPT;
constexpr wide_int_bitmask PTA_LUJIAZUI = PTA_64BIT | PTA_MMX | PTA_SSE
- | PTA_SSE2 | PTA_SSE3 | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
- | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_BMI | PTA_BMI2 | PTA_PRFCHW
- | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE
- | PTA_ADX | PTA_RDSEED | PTA_POPCNT;
+ | PTA_SSE2 | PTA_SSE3 | PTA_CX16 | PTA_LZCNT | PTA_POPCNT | PTA_ABM
+ | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_BMI
+ | PTA_BMI2 | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
+ | PTA_RDRND | PTA_MOVBE | PTA_ADX | PTA_RDSEED;
constexpr wide_int_bitmask PTA_YONGFENG = PTA_LUJIAZUI | PTA_AVX | PTA_AVX2
- | PTA_F16C | PTA_FMA | PTA_SHA | PTA_LZCNT;
+ | PTA_F16C | PTA_FMA | PTA_SHA;
#ifndef GENERATOR_FILE
@@ -2865,6 +2882,9 @@ struct GTY(()) machine_function {
approximation. */
BOOL_BITFIELD tls_descriptor_call_expanded_p : 1;
+ /* True if a TLS descriptor call is expanded more than once. */
+ BOOL_BITFIELD tls_descriptor_call_multiple_p : 1;
+
/* If true, the current function has a STATIC_CHAIN is placed on the
stack below the return address. */
BOOL_BITFIELD static_chain_on_stack : 1;
@@ -2934,6 +2954,9 @@ struct GTY(()) machine_function {
/* True if this is a recursive function. */
BOOL_BITFIELD recursive_function : 1;
+ /* True if a by_pieces operation is currently being expanded. */
+ BOOL_BITFIELD by_pieces_in_use : 1;
+
/* The largest alignment, in bytes, of stack slot actually used. */
unsigned int max_used_stack_alignment;
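
The new by_pieces_in_use flag presumably pairs with the
TARGET_USE_BY_PIECES_INFRASTRUCTURE_P hook registered earlier in this diff,
letting ix86_use_by_pieces_infrastructure_p tell whether a by-pieces expansion
is already in progress; that reading is inferred from the two visible pieces
rather than stated by the patch.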
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index eb52699..cea6c15 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -901,6 +901,10 @@
(define_attr "avx_partial_xmm_update" "false,true"
(const_string "false"))
+;; Define attribute to indicate 64-bit TLS insns.
+(define_attr "tls64" "gd,ld_base,call,combine,lea,none"
+ (const_string "none"))
+
;; Define attribute to classify add/sub insns that consumes carry flag (CF)
(define_attr "use_carry" "0,1" (const_string "0"))
@@ -1618,10 +1622,8 @@
(compare
(match_operand:QI 0 "nonimmediate_operand" "QBn")
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)))]
"ix86_match_ccmode (insn, CCmode)"
"cmp{b}\t{%h1, %0|%0, %h1}"
[(set_attr "addr" "gpr8")
@@ -1632,10 +1634,8 @@
[(set (reg FLAGS_REG)
(compare
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "const0_operand")))]
"ix86_match_ccmode (insn, CCNOmode)"
"test{b}\t%h0, %h0"
@@ -1657,10 +1657,8 @@
[(set (reg FLAGS_REG)
(compare
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "general_operand" "QnBn")))]
"ix86_match_ccmode (insn, CCmode)"
"cmp{b}\t{%1, %h0|%h0, %1}"
@@ -1672,15 +1670,11 @@
[(set (reg FLAGS_REG)
(compare
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))]
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)))]
"ix86_match_ccmode (insn, CCmode)"
"cmp{b}\t{%h1, %h0|%h0, %h1}"
[(set_attr "type" "icmp")
@@ -2968,7 +2962,8 @@
(match_operand:SWI248 1 "const_int_operand"))]
"optimize_insn_for_size_p () && optimize_size > 1
&& operands[1] != const0_rtx
- && operands[1] != constm1_rtx
+ && (operands[1] != constm1_rtx
+ || (<MODE>mode == DImode && LEGACY_INT_REG_P (operands[0])))
&& IN_RANGE (INTVAL (operands[1]), -128, 127)
&& !ix86_red_zone_used
&& REGNO (operands[0]) != SP_REG"
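
The relaxed condition admits -1 for DImode in legacy registers, presumably on
encoding-size grounds: push $-1 (2 bytes) plus pop %reg (1 byte for a legacy
register) totals 3 bytes, beating the 4-byte REX-prefixed orq $-1, %reg,
whereas the 32-bit orl $-1, %reg is already 3 bytes, so -1 stays excluded
elsewhere. This is an inference from the instruction encodings, not a claim
from the patch.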
@@ -3479,10 +3474,8 @@
[(set (strict_low_part
(match_operand:QI 0 "register_operand" "+Q"))
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"mov{b}\t{%h1, %0|%0, %h1}"
[(set_attr "type" "imov")
@@ -3565,10 +3558,8 @@
(define_insn "*extzvqi"
[(set (match_operand:QI 0 "nonimmediate_operand" "=QBn,?R")
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q,Q")]) 0))]
""
{
switch (get_attr_type (insn))
@@ -3689,10 +3680,8 @@
(match_operand 0 "int248_register_operand" "+Q")
(const_int 8)
(const_int 8))
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]))]
""
"mov{b}\t{%h1, %h0|%h0, %h1}"
[(set_attr "type" "imov")
@@ -5259,10 +5248,8 @@
[(set (match_operand:SWI24 0 "register_operand" "=R")
(sign_extend:SWI24
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)))]
""
"movs{b<SWI24:imodesuffix>|x}\t{%h1, %0|%0, %h1}"
[(set_attr "type" "imovx")
@@ -7008,10 +6995,8 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+Q,&Q"))
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)
(match_operand:QI 1 "nonimmediate_operand" "0,!qm")))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
@@ -7025,8 +7010,8 @@
[(set (strict_low_part (match_dup 0))
(plus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -7037,29 +7022,25 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+&Q"))
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"#"
"&& reload_completed"
[(set (strict_low_part (match_dup 0))
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0))
(parallel
[(set (strict_low_part (match_dup 0))
(plus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -7474,10 +7455,8 @@
[(set (match_operand:QI 0 "nonimmediate_operand" "=QBn")
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "nonimmediate_operand" "0")))
(clobber (reg:CC FLAGS_REG))]
""
@@ -7490,29 +7469,25 @@
[(set (match_operand:QI 0 "register_operand" "=&Q")
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
""
"#"
"&& reload_completed"
[(set (match_dup 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0))
(parallel
[(set (match_dup 0)
(plus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -7542,10 +7517,8 @@
(subreg:SWI248
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -7580,8 +7553,8 @@
(subreg:SWI248
(plus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -7601,15 +7574,11 @@
(subreg:SWI248
(plusminus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "<comm>0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "<comm>0,!Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)) 0))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
""
"@
@@ -7628,11 +7597,11 @@
(subreg:SWI248
(plusminus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -8229,10 +8198,8 @@
(minus:QI
(match_operand:QI 1 "nonimmediate_operand" "0,!qm")
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"@
@@ -8246,8 +8213,8 @@
(minus:QI
(match_dup 0)
(subreg:QI
- (match_op_dup 3
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -8257,30 +8224,26 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+&Q"))
(minus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"#"
"&& reload_completed"
[(set (strict_low_part (match_dup 0))
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0))
(parallel
[(set (strict_low_part (match_dup 0))
(minus:QI
(match_dup 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -8331,10 +8294,8 @@
(minus:QI
(match_operand:QI 1 "nonimmediate_operand" "0")
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
""
"sub{b}\t{%h2, %0|%0, %h2}"
@@ -8346,30 +8307,26 @@
[(set (match_operand:QI 0 "register_operand" "=&Q")
(minus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
""
"#"
"&& reload_completed"
[(set (match_dup 0)
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0))
(parallel
[(set (match_dup 0)
(minus:QI
(match_dup 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -8384,10 +8341,8 @@
(subreg:SWI248
(minus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -8406,8 +8361,8 @@
(subreg:SWI248
(minus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -12355,10 +12310,8 @@
(compare
(and:QI
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "general_operand" "QnBn"))
(const_int 0)))]
"ix86_match_ccmode (insn, CCNOmode)"
@@ -12372,15 +12325,11 @@
(compare
(and:QI
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0))
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0))
(const_int 0)))]
"ix86_match_ccmode (insn, CCNOmode)"
"test{b}\t{%h1, %h0|%h0, %h1}"
@@ -12969,10 +12918,8 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+Q,&Q"))
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)
(match_operand:QI 1 "nonimmediate_operand" "0,!qm")))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
@@ -12986,8 +12933,8 @@
[(set (strict_low_part (match_dup 0))
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -12998,29 +12945,25 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+&Q"))
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"#"
"&& reload_completed"
[(set (strict_low_part (match_dup 0))
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0))
(parallel
[(set (strict_low_part (match_dup 0))
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -13223,10 +13166,8 @@
[(set (match_operand:QI 0 "nonimmediate_operand" "=QBn")
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "nonimmediate_operand" "0")))
(clobber (reg:CC FLAGS_REG))]
""
@@ -13239,29 +13180,25 @@
[(set (match_operand:QI 0 "register_operand" "=&Q")
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
""
"#"
"&& reload_completed"
[(set (match_dup 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0))
(parallel
[(set (match_dup 0)
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -13291,10 +13228,8 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -13313,8 +13248,8 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -13328,10 +13263,8 @@
(match_operator 5 "compare_operator"
[(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "general_operand" "QnBn,QnBn"))
(const_int 0)]))
(set (zero_extract:SWI248
@@ -13341,8 +13274,8 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))]
"ix86_match_ccmode (insn, CCNOmode)"
"@
@@ -13358,9 +13291,9 @@
[(set (match_dup 4)
(match_op_dup 5
[(any_logic:QI
- (subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (subreg:QI
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2))
(const_int 0)]))
(set (zero_extract:SWI248
@@ -13368,8 +13301,8 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))])]
""
[(set_attr "addr" "gpr8")
@@ -13385,15 +13318,11 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "%0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "%0,!Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)) 0))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
""
"@
@@ -13412,11 +13341,11 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -13428,12 +13357,10 @@
(match_operand 0 "int248_register_operand" "+Q,&Q")
(const_int 8)
(const_int 8))
- (match_operator:SWI248 3 "extract_operator"
+ (match_operator:SWI248 3 "extract_high_operator"
[(any_logic
(match_operand 1 "int248_register_operand" "%0,!Q")
- (match_operand 2 "int248_register_operand" "Q,Q"))
- (const_int 8)
- (const_int 8)]))
+ (match_operand 2 "int248_register_operand" "Q,Q"))]))
(clobber (reg:CC FLAGS_REG))]
"GET_MODE (operands[1]) == GET_MODE (operands[2])"
"@
@@ -13449,9 +13376,9 @@
(parallel
[(set (zero_extract:SWI248
(match_dup 0) (const_int 8) (const_int 8))
- (match_op_dup 3
- [(any_logic (match_dup 4) (match_dup 2))
- (const_int 8) (const_int 8)]))
+ (zero_extract:SWI248
+ (any_logic (match_dup 4) (match_dup 2))
+ (const_int 8) (const_int 8)))
(clobber (reg:CC FLAGS_REG))])]
"operands[4] = gen_lowpart (GET_MODE (operands[1]), operands[0]);"
[(set_attr "type" "alu")
@@ -14696,10 +14623,8 @@
(subreg:SWI248
(neg:QI
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)) 0))
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
""
"@
@@ -14717,8 +14642,8 @@
(subreg:SWI248
(neg:QI
(subreg:QI
- (match_op_dup 2
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)) 0))
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "negnot")
@@ -15350,13 +15275,9 @@
(match_operand 0 "int248_register_operand" "+Q,&Q")
(const_int 8)
(const_int 8))
- (subreg:SWI248
- (not:QI
- (subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)) 0))]
+ (not:SWI248
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")])))]
""
"@
not{b}\t%h0
@@ -15369,11 +15290,8 @@
(match_dup 1) (const_int 8) (const_int 8)))
(set (zero_extract:SWI248
(match_dup 0) (const_int 8) (const_int 8))
- (subreg:SWI248
- (not:QI
- (subreg:QI
- (match_op_dup 2
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)) 0))]
+ (not:SWI248
+ (zero_extract:SWI248 (match_dup 0) (const_int 8) (const_int 8))))]
""
[(set_attr "type" "negnot")
(set_attr "mode" "QI")])
@@ -16720,10 +16638,8 @@
(subreg:SWI248
(ashift:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "nonmemory_operand" "cI,cI")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -16757,8 +16673,8 @@
(subreg:SWI248
(ashift:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -18004,10 +17920,8 @@
(subreg:SWI248
(any_shiftrt:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "nonmemory_operand" "cI,cI")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -18033,8 +17947,8 @@
(subreg:SWI248
(any_shiftrt:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -18388,17 +18302,17 @@
(any_rotate:SWI
(match_operand:SWI 1 "const_int_operand")
(subreg:QI
- (and
- (match_operand 2 "int248_register_operand")
- (match_operand 3 "const_int_operand")) 0)))]
+ (match_operator 4 "and_operator"
+ [(match_operand 2 "int248_register_operand")
+ (match_operand 3 "const_int_operand")]) 0)))]
"(INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode) - 1))
== GET_MODE_BITSIZE (<MODE>mode) - 1"
- [(set (match_dup 4) (match_dup 1))
+ [(set (match_dup 5) (match_dup 1))
(set (match_dup 0)
- (any_rotate:SWI (match_dup 4)
+ (any_rotate:SWI (match_dup 5)
(subreg:QI
- (and:SI (match_dup 2) (match_dup 3)) 0)))]
- "operands[4] = gen_reg_rtx (<MODE>mode);")
+ (match_op_dup 4 [(match_dup 2) (match_dup 3)]) 0)))]
+ "operands[5] = gen_reg_rtx (<MODE>mode);")
(define_insn_and_split "*<insn><mode>3_mask_1"
[(set (match_operand:SWI 0 "nonimmediate_operand")
@@ -23243,6 +23157,7 @@
return "call\t{*%p2@GOTPCREL(%%rip)|[QWORD PTR %p2@GOTPCREL[rip]]}";
}
[(set_attr "type" "multi")
+ (set_attr "tls64" "gd")
(set (attr "length")
(symbol_ref "TARGET_X32 ? 15 : 16"))])
@@ -23281,7 +23196,11 @@
UNSPEC_TLS_GD)
(clobber (match_operand:P 3 "register_operand"))])]
"TARGET_64BIT"
- "ix86_tls_descriptor_calls_expanded_in_cfun = true;")
+{
+ if (ix86_tls_descriptor_calls_expanded_in_cfun)
+ cfun->machine->tls_descriptor_call_multiple_p = true;
+ ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
(define_insn "*tls_local_dynamic_base_32_gnu"
[(set (match_operand:SI 0 "register_operand" "=a")
@@ -23343,6 +23262,7 @@
return "call\t{*%p1@GOTPCREL(%%rip)|[QWORD PTR %p1@GOTPCREL[rip]]}";
}
[(set_attr "type" "multi")
+ (set_attr "tls64" "ld_base")
(set_attr "length" "12")])
(define_insn "*tls_local_dynamic_base_64_largepic"
@@ -23376,7 +23296,11 @@
(unspec:P [(reg:P SP_REG)] UNSPEC_TLS_LD_BASE)
(clobber (match_operand:P 2 "register_operand"))])]
"TARGET_64BIT"
- "ix86_tls_descriptor_calls_expanded_in_cfun = true;")
+{
+ if (ix86_tls_descriptor_calls_expanded_in_cfun)
+ cfun->machine->tls_descriptor_call_multiple_p = true;
+ ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
;; Local dynamic of a single variable is a lose. Show combine how
;; to convert that back to global dynamic.
@@ -23570,6 +23494,8 @@
"TARGET_64BIT && TARGET_GNU2_TLS"
{
operands[2] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : operands[0];
+ if (ix86_tls_descriptor_calls_expanded_in_cfun)
+ cfun->machine->tls_descriptor_call_multiple_p = true;
ix86_tls_descriptor_calls_expanded_in_cfun = true;
})
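
Taken together, the TLS hunks above only record facts:
tls_descriptor_call_multiple_p is set once a second TLS descriptor call is
expanded in the same function, and the new tls64 insn attribute classifies the
parts of the 64-bit TLS sequences (gd, ld_base, lea, call, combine).
Presumably the consumer is the new pass machinery in
i386-features.cc/i386-passes.def listed in this patch's diffstat; nothing in
these hunks acts on the flag or the attribute yet.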
@@ -23581,6 +23507,7 @@
"lea%z0\t{%E1@TLSDESC(%%rip), %0|%0, %E1@TLSDESC[rip]}"
[(set_attr "type" "lea")
(set_attr "mode" "<MODE>")
+ (set_attr "tls64" "lea")
(set_attr "length" "7")
(set_attr "length_address" "4")])
@@ -23594,6 +23521,7 @@
"TARGET_64BIT && TARGET_GNU2_TLS"
"call\t{*%a1@TLSCALL(%2)|[QWORD PTR [%2+%a1@TLSCALL]]}"
[(set_attr "type" "call")
+ (set_attr "tls64" "call")
(set_attr "length" "2")
(set_attr "length_address" "0")])
@@ -23615,7 +23543,8 @@
{
operands[4] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : operands[0];
emit_insn (gen_tls_dynamic_gnu2_64 (ptr_mode, operands[4], operands[1]));
-})
+}
+ [(set_attr "tls64" "combine")])
(define_split
[(match_operand 0 "tls_address_pattern")]
@@ -28251,10 +28180,8 @@
(match_operator 1 "compare_operator"
[(and:QI
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand")]) 0)
(match_operand 3 "const_int_operand"))
(const_int 0)]))]
"! TARGET_PARTIAL_REG_STALL
@@ -28266,9 +28193,9 @@
(match_op_dup 1
[(and:QI
(subreg:QI
- (match_op_dup 4 [(match_dup 2)
- (const_int 8)
- (const_int 8)]) 0)
+ (zero_extract:SWI248 (match_dup 2)
+ (const_int 8)
+ (const_int 8)) 0)
(match_dup 3))
(const_int 0)]))
(set (zero_extract:SWI248 (match_dup 2)
@@ -28277,9 +28204,9 @@
(subreg:SWI248
(and:QI
(subreg:QI
- (match_op_dup 4 [(match_dup 2)
- (const_int 8)
- (const_int 8)]) 0)
+ (zero_extract:SWI248 (match_dup 2)
+ (const_int 8)
+ (const_int 8)) 0)
(match_dup 3)) 0))])])
;; Don't do logical operations with memory inputs.
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index c93c0b1..6bda22f 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1246,6 +1246,10 @@ munroll-only-small-loops
Target Var(ix86_unroll_only_small_loops) Init(0) Optimization
Enable conservative small loop unrolling.
+-param=ix86-vect-unroll-limit=
+Target Joined UInteger Var(ix86_vect_unroll_limit) Init(4) Param
+Limit how much the autovectorizer may unroll a loop.
+
mlam=
Target RejectNegative Joined Enum(lam_type) Var(ix86_lam_type) Init(lam_none)
-mlam=[none|u48|u57] Instrument meta data position in user data pointers.
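
Example use of the new knob (hypothetical command line):

  gcc -O3 -march=znver4 --param=ix86-vect-unroll-limit=1 foo.c

A value of 1 disables the reduction-driven unrolling added in finish_cost,
since the suggestion is only made when the resulting factor exceeds 1; larger
values bound the suggested factor from above.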
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index b2d2eec..5dbe444 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1319,6 +1319,9 @@
(ior (match_operand 0 "nonimmediate_operand")
(match_test "const_vec_duplicate_p (op)")))
+(define_predicate "const_vec_dup_operand"
+ (match_test "const_vec_duplicate_p (op)"))
+
;; Return true when OP is either register operand, or any
;; CONST_VECTOR.
(define_predicate "reg_or_const_vector_operand"
@@ -1714,10 +1717,14 @@
(define_predicate "div_operator"
(match_code "div"))
-;; Return true if this is a and, ior or xor operation.
+;; Return true if this is an and, ior or xor operation.
(define_predicate "logic_operator"
(match_code "and,ior,xor"))
+;; Return true if this is an and operation.
+(define_predicate "and_operator"
+ (match_code "and"))
+
;; Return true if this is a plus, minus, and, ior or xor operation.
(define_predicate "plusminuslogic_operator"
(match_code "plus,minus,and,ior,xor"))
@@ -1740,8 +1747,12 @@
(define_predicate "compare_operator"
(match_code "compare"))
-(define_predicate "extract_operator"
- (match_code "zero_extract,sign_extract"))
+(define_predicate "extract_high_operator"
+ (match_code "zero_extract,sign_extract,ashiftrt,lshiftrt")
+{
+ return (const8_operand (XEXP (op, 1), VOIDmode)
+ && (BINARY_P (op) || const8_operand (XEXP (op, 2), VOIDmode)));
+})
;; Return true if OP is a memory operand, aligned to
;; less than its natural alignment.
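
In other words, besides the (zero_extract (reg) (const_int 8) (const_int 8))
and sign_extract forms the old extract_operator accepted, the renamed
predicate also matches a high-byte access written as a shift, e.g.
(lshiftrt:HI (reg:HI ...) (const_int 8)); BINARY_P is true for the two-operand
shift codes, so the third-operand check is skipped for them.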
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d88c3d6..73906b8 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -326,6 +326,9 @@
(define_mode_iterator VI1_AVX512VL
[V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")])
+(define_mode_iterator VI1_AVX512_3264
+ [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX")])
+
;; All vector modes
(define_mode_iterator V
[(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
@@ -21729,6 +21732,19 @@
(const_string "orig")))
(set_attr "mode" "TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")])
+;; Eliminate redundancy caused by
+;; /* Special case TImode to 128-bit vector conversions via V2DI. */
+;; in ix86_expand_vector_move.
+
+(define_split
+ [(set (match_operand:V2DI 0 "register_operand")
+ (vec_concat:V2DI
+ (subreg:DI (match_operand:TI 1 "register_operand") 0)
+ (subreg:DI (match_dup 1) 8)))]
+ "TARGET_SSE2 && ix86_pre_reload_split ()"
+ [(set (match_dup 0)
+ (subreg:V2DI (match_dup 1) 0))])
+
(define_insn "*vec_concatv2di_0"
[(set (match_operand:V2DI 0 "register_operand" "=v,v ,x")
(vec_concat:V2DI
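
A sketch of source that can reach the new split (an assumption about how the
bitcast is expanded, not taken from the patch):

  /* Punning an __int128 to a 128-bit vector may go through
     ix86_expand_vector_move's TImode special case, producing the
     (vec_concat:V2DI (subreg:DI ...) (subreg:DI ...)) form that the
     split above folds back into a plain V2DI subreg.  */
  typedef long long v2di __attribute__ ((vector_size (16)));

  v2di ti_to_v2di (__int128 x)
  {
    union { __int128 t; v2di v; } u;
    u.t = x;
    return u.v;
  }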
@@ -26546,9 +26562,9 @@
;; XOP packed rotate instructions
(define_expand "rotl<mode>3"
- [(set (match_operand:VI_128 0 "register_operand")
- (rotate:VI_128
- (match_operand:VI_128 1 "nonimmediate_operand")
+ [(set (match_operand:VI248_128 0 "register_operand")
+ (rotate:VI248_128
+ (match_operand:VI248_128 1 "nonimmediate_operand")
(match_operand:SI 2 "general_operand")))]
"TARGET_XOP"
{
@@ -26577,9 +26593,9 @@
})
(define_expand "rotr<mode>3"
- [(set (match_operand:VI_128 0 "register_operand")
- (rotatert:VI_128
- (match_operand:VI_128 1 "nonimmediate_operand")
+ [(set (match_operand:VI248_128 0 "register_operand")
+ (rotatert:VI248_128
+ (match_operand:VI248_128 1 "nonimmediate_operand")
(match_operand:SI 2 "general_operand")))]
"TARGET_XOP"
{
@@ -26951,31 +26967,122 @@
int i;
if (<CODE> != ASHIFT)
- {
- if (CONST_INT_P (operands[2]))
- operands[2] = GEN_INT (-INTVAL (operands[2]));
- else
- negate = true;
- }
+ {
+ if (CONST_INT_P (operands[2]))
+ operands[2] = GEN_INT (-INTVAL (operands[2]));
+ else
+ negate = true;
+ }
par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
tmp = lowpart_subreg (QImode, operands[2], SImode);
for (i = 0; i < 16; i++)
- XVECEXP (par, 0, i) = tmp;
+ XVECEXP (par, 0, i) = tmp;
tmp = gen_reg_rtx (V16QImode);
emit_insn (gen_vec_initv16qiqi (tmp, par));
if (negate)
- emit_insn (gen_negv16qi2 (tmp, tmp));
+ emit_insn (gen_negv16qi2 (tmp, tmp));
gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
emit_insn (gen (operands[0], operands[1], tmp));
}
+ else if (TARGET_GFNI && CONST_INT_P (operands[2])
+ && (<MODE_SIZE> == 64
+ || !(INTVAL (operands[2]) == 7 && <CODE> == ASHIFTRT)))
+ {
+ rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2],
+ <CODE>);
+ emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
+ const0_rtx));
+ }
else
ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
DONE;
})
+(define_expand "cond_<insn><mode>"
+ [(set (match_operand:VI1_AVX512VL 0 "register_operand")
+ (vec_merge:VI1_AVX512VL
+ (any_shift:VI1_AVX512VL
+ (match_operand:VI1_AVX512VL 2 "register_operand")
+ (match_operand:VI1_AVX512VL 3 "const_vec_dup_operand"))
+ (match_operand:VI1_AVX512VL 4 "nonimm_or_0_operand")
+ (match_operand:<avx512fmaskmode> 1 "register_operand")))]
+ "TARGET_GFNI && TARGET_AVX512F"
+{
+ rtx count = XVECEXP (operands[3], 0, 0);
+ rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], count, <CODE>);
+ emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[2], matrix,
+ const0_rtx, operands[4],
+ operands[1]));
+ DONE;
+})
+
+(define_expand "<insn><mode>3"
+ [(set (match_operand:VI1_AVX512_3264 0 "register_operand")
+ (any_rotate:VI1_AVX512_3264
+ (match_operand:VI1_AVX512_3264 1 "register_operand")
+ (match_operand:SI 2 "const_int_operand")))]
+ "TARGET_GFNI"
+{
+ rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+ emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
+ const0_rtx));
+ DONE;
+})
+
+(define_expand "<insn>v16qi3"
+ [(set (match_operand:V16QI 0 "register_operand")
+ (any_rotate:V16QI
+ (match_operand:V16QI 1 "nonimmediate_operand")
+ (match_operand:SI 2 "general_operand")))]
+ "TARGET_GFNI || TARGET_XOP"
+{
+ /* Handle the V16QI XOP case to avoid a conflict with the other expand. */
+ if (TARGET_XOP)
+ {
+ if (! const_0_to_7_operand (operands[2], SImode))
+ {
+ rtvec vs = rtvec_alloc (16);
+ rtx par = gen_rtx_PARALLEL (V16QImode, vs);
+ rtx reg = gen_reg_rtx (V16QImode);
+ rtx op2 = operands[2];
+ int i;
+
+ if (GET_MODE (op2) != QImode)
+ {
+ op2 = gen_reg_rtx (QImode);
+ convert_move (op2, operands[2], false);
+ }
+
+ for (i = 0; i < 16; i++)
+ RTVEC_ELT (vs, i) = op2;
+
+ emit_insn (gen_vec_initv16qiqi (reg, par));
+ if (<CODE> == ROTATERT)
+ {
+ rtx neg = gen_reg_rtx (V16QImode);
+ emit_insn (gen_negv16qi2 (neg, reg));
+ reg = neg;
+ }
+ emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], reg));
+ DONE;
+ }
+ }
+ else if (TARGET_GFNI && CONST_INT_P (operands[2]))
+ {
+ rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+ emit_insn (gen_vgf2p8affineqb_v16qi (operands[0],
+ force_reg (V16QImode, operands[1]),
+ matrix, const0_rtx));
+ DONE;
+ }
+ else
+ FAIL;
+})
+
(define_expand "ashrv2di3"
[(set (match_operand:V2DI 0 "register_operand")
(ashiftrt:V2DI
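
The GFNI expanders above all rely on one trick: vgf2p8affineqb multiplies each
byte, viewed as an 8-bit vector over GF(2), by an 8x8 bit matrix, so any
byte-granular shift or rotate by a constant becomes a single instruction once
ix86_vgf2p8affine_shift_matrix encodes the right matrix. A scalar model of the
idea (illustrative only; the bit and column ordering here is a local
convention, not necessarily the hardware encoding):

  #include <stdint.h>

  /* Affine transform over GF(2): result = M*x + b, where col[i] is the
     matrix column selected by input bit i.  */
  static uint8_t
  gf2_affine_byte (const uint8_t col[8], uint8_t x, uint8_t b)
  {
    uint8_t r = b;
    for (int i = 0; i < 8; i++)
      if (x & (1u << i))
        r ^= col[i];
    return r;
  }

  /* Matrix for a logical left shift by N: input bit i feeds output
     bit i + N and nothing else.  */
  static void
  shl_matrix (uint8_t col[8], int n)
  {
    for (int i = 0; i < 8; i++)
      col[i] = (i + n < 8) ? (uint8_t) (1u << (i + n)) : 0;
  }

A rotate wraps the index modulo 8 instead of dropping it, and an arithmetic
right shift replicates the sign column; the ASHIFTRT-by-7 carve-out for
128/256-bit modes above is presumably because broadcasting the sign bit is
already cheap there (e.g. a byte compare against zero), an inference rather
than something the patch states.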
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index c8603b9..1649ea2 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -141,6 +141,12 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
COSTS_N_BYTES (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
ix86_size_memcpy,
ix86_size_memset,
COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
@@ -261,6 +267,12 @@ struct processor_costs i386_cost = { /* 386 specific costs */
COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
i386_memcpy,
i386_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -382,6 +394,12 @@ struct processor_costs i486_cost = { /* 486 specific costs */
COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
i486_memcpy,
i486_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -501,6 +519,12 @@ struct processor_costs pentium_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
pentium_memcpy,
pentium_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -613,6 +637,12 @@ struct processor_costs lakemont_cost = {
COSTS_N_INSNS (5), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
pentium_memcpy,
pentium_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -740,6 +770,12 @@ struct processor_costs pentiumpro_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
pentiumpro_memcpy,
pentiumpro_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -858,6 +894,12 @@ struct processor_costs geode_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
geode_memcpy,
geode_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -979,6 +1021,12 @@ struct processor_costs k6_cost = {
COSTS_N_INSNS (2), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (2), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
k6_memcpy,
k6_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -1101,6 +1149,12 @@ struct processor_costs athlon_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
athlon_memcpy,
athlon_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -1232,6 +1286,12 @@ struct processor_costs k8_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
k8_memcpy,
k8_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -1371,6 +1431,12 @@ struct processor_costs amdfam10_cost = {
COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
amdfam10_memcpy,
amdfam10_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
@@ -1503,6 +1569,12 @@ const struct processor_costs bdver_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
bdver_memcpy,
bdver_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -1668,6 +1740,12 @@ struct processor_costs znver1_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {5, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver1_memcpy,
znver1_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -1836,6 +1914,12 @@ struct processor_costs znver2_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {10, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -1979,6 +2063,12 @@ struct processor_costs znver3_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 6}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2125,6 +2215,12 @@ struct processor_costs znver4_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 6}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2287,6 +2383,12 @@ struct processor_costs znver5_cost = {
We increase width to 6 for multiplications
in ix86_reassociation_width. */
6, 6, 4, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 6}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2422,6 +2524,12 @@ struct processor_costs skylake_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
skylake_memcpy,
skylake_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -2559,6 +2667,12 @@ struct processor_costs icelake_cost = {
COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 10, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
icelake_memcpy,
icelake_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -2690,6 +2804,12 @@ struct processor_costs alderlake_cost = {
COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
alderlake_memcpy,
alderlake_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2814,6 +2934,12 @@ const struct processor_costs btver1_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
btver1_memcpy,
btver1_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
@@ -2935,6 +3061,12 @@ const struct processor_costs btver2_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
btver2_memcpy,
btver2_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
@@ -3055,6 +3187,12 @@ struct processor_costs pentium4_cost = {
COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
pentium4_memcpy,
pentium4_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3178,6 +3316,12 @@ struct processor_costs nocona_cost = {
COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
nocona_memcpy,
nocona_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3299,6 +3443,12 @@ struct processor_costs atom_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 2, /* Limit how much the autovectorizer
+ may unroll a loop. */
atom_memcpy,
atom_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3420,6 +3570,12 @@ struct processor_costs slm_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
slm_memcpy,
slm_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3555,6 +3711,12 @@ struct processor_costs tremont_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
tremont_memcpy,
tremont_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -3681,6 +3843,12 @@ struct processor_costs lujiazui_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
lujiazui_memcpy,
lujiazui_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -3805,6 +3973,12 @@ struct processor_costs yongfeng_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
yongfeng_memcpy,
yongfeng_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3929,6 +4103,12 @@ struct processor_costs shijidadao_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
shijidadao_memcpy,
shijidadao_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -4078,6 +4258,12 @@ struct processor_costs generic_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
generic_memcpy,
generic_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -4215,6 +4401,12 @@ struct processor_costs core_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
core_memcpy,
core_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
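
Each hunk above pairs a {FMA, DOT_PROD_EXPR, SAD_EXPR} triple of latency-times-throughput
products with a cap on how far the autovectorizer may unroll. As a minimal sketch of how
such values could drive an unroll decision -- the function name and the power-of-two
search below are assumptions for illustration, not the patch's actual heuristic, which
lives elsewhere in this patch:

/* Sketch only: pick the largest power-of-two unroll factor that both
   stays within the per-CPU limit and does not exceed the latency *
   throughput product of the dominating reduction operation, so enough
   independent copies are in flight to hide its latency.  */
static unsigned
sketch_unroll_factor (unsigned lat_x_tput, unsigned limit)
{
  unsigned uf = 1;
  while (uf * 2 <= lat_x_tput && uf * 2 <= limit)
    uf *= 2;
  return uf;
}

Under this reading, znver4's FMA entry gives sketch_unroll_factor (8, 4) == 4 (clamped by
the limit), while btver1's {1, 1, 1} with limit 1 gives 1, i.e. no extra unrolling.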